author    | Allan Sandfeld Jensen <allan.jensen@theqtcompany.com> | 2015-08-14 11:38:45 +0200
committer | Allan Sandfeld Jensen <allan.jensen@theqtcompany.com> | 2015-08-14 17:16:47 +0000
commit    | 3a97ca8dd9b96b599ae2d33e40df0dd2f7ea5859 (patch)
tree      | 43cc572ba067417c7341db81f71ae7cc6e0fcc3e /chromium/third_party/libvpx
parent    | f61ab1ac7f855cd281809255c0aedbb1895e1823 (diff)
download  | qtwebengine-chromium-3a97ca8dd9b96b599ae2d33e40df0dd2f7ea5859.tar.gz
BASELINE: Update chromium to 45.0.2454.40
Change-Id: Id2121d9f11a8fc633677236c65a3e41feef589e4
Reviewed-by: Andras Becsi <andras.becsi@theqtcompany.com>
Diffstat (limited to 'chromium/third_party/libvpx')
347 files changed, 22658 insertions, 13118 deletions
diff --git a/chromium/third_party/libvpx/BUILD.gn b/chromium/third_party/libvpx/BUILD.gn
index bb55816b80b..d33761594d9 100644
--- a/chromium/third_party/libvpx/BUILD.gn
+++ b/chromium/third_party/libvpx/BUILD.gn
@@ -89,7 +89,7 @@ static_library("libvpx_intrinsics_sse2") {
   configs += [ ":libvpx_config" ]
   configs -= [ "//build/config/compiler:chromium_code" ]
   configs += [ "//build/config/compiler:no_chromium_code" ]
-  if (!is_win) {
+  if (!is_win || is_clang) {
     cflags = [ "-msse2" ]
   }
   if (current_cpu == "x86") {
@@ -103,7 +103,7 @@ static_library("libvpx_intrinsics_ssse3") {
   configs += [ ":libvpx_config" ]
   configs -= [ "//build/config/compiler:chromium_code" ]
   configs += [ "//build/config/compiler:no_chromium_code" ]
-  if (!is_win) {
+  if (!is_win || is_clang) {
     cflags = [ "-mssse3" ]
   }
   if (current_cpu == "x86") {
@@ -117,7 +117,7 @@ static_library("libvpx_intrinsics_sse4_1") {
   configs += [ ":libvpx_config" ]
   configs -= [ "//build/config/compiler:chromium_code" ]
   configs += [ "//build/config/compiler:no_chromium_code" ]
-  if (!is_win) {
+  if (!is_win || is_clang) {
     cflags = [ "-msse4.1" ]
   }
   if (current_cpu == "x86") {
diff --git a/chromium/third_party/libvpx/README.chromium b/chromium/third_party/libvpx/README.chromium
index a8025f3705c..af0dcef0703 100644
--- a/chromium/third_party/libvpx/README.chromium
+++ b/chromium/third_party/libvpx/README.chromium
@@ -5,9 +5,9 @@ License: BSD
 License File: source/libvpx/LICENSE
 Security Critical: yes
 
-Date: Monday May 11 2015
+Date: Monday June 29 2015
 Branch: master
-Commit: 6d227137221ea4eead9d81daf3d2f0915560bbff
+Commit: f3a1295cffe62806255bace4abb49c8831b7a61f
 
 Description:
 Contains the sources used to compile libvpx binaries used by Google Chrome and
diff --git a/chromium/third_party/libvpx/codereview.settings b/chromium/third_party/libvpx/codereview.settings
index db0d04cc885..0e7d0e87548 100644
--- a/chromium/third_party/libvpx/codereview.settings
+++ b/chromium/third_party/libvpx/codereview.settings
@@ -1,4 +1,9 @@
-# This file is used by gcl to get repository specific information.
CODE_REVIEW_SERVER: codereview.chromium.org +CC_LIST: chromium-reviews@chromium.org VIEW_VC: https://chromium.googlesource.com/chromium/deps/libvpx/+/ STATUS: http://chromium-status.appspot.com/status +TRY_ON_UPLOAD: False +TRYSERVER_SVN_URL: svn://svn.chromium.org/chrome-try/try +GITCL_PREUPLOAD: http://src.chromium.org/viewvc/chrome/trunk/tools/depot_tools/git-cl-upload-hook?revision=HEAD +GITCL_PREDCOMMIT: http://src.chromium.org/viewvc/chrome/trunk/tools/depot_tools/git-cl-upload-hook?revision=HEAD +PROJECT: chromium_deps diff --git a/chromium/third_party/libvpx/libvpx_srcs.gni b/chromium/third_party/libvpx/libvpx_srcs.gni index 39f445180ea..1f94e7ce601 100644 --- a/chromium/third_party/libvpx/libvpx_srcs.gni +++ b/chromium/third_party/libvpx/libvpx_srcs.gni @@ -188,6 +188,7 @@ libvpx_srcs_x86 = [ "//third_party/libvpx/source/libvpx/vp9/common/vp9_thread_common.h", "//third_party/libvpx/source/libvpx/vp9/common/vp9_tile_common.c", "//third_party/libvpx/source/libvpx/vp9/common/vp9_tile_common.h", + "//third_party/libvpx/source/libvpx/vp9/common/x86/convolve.h", "//third_party/libvpx/source/libvpx/vp9/common/x86/vp9_asm_stubs.c", "//third_party/libvpx/source/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.h", "//third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodeframe.c", @@ -280,7 +281,10 @@ libvpx_srcs_x86 = [ "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_write_bit_buffer.h", "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_writer.c", "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_writer.h", + "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct32x32_avx2_impl.h", + "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct32x32_sse2_impl.h", "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_sse2.h", + "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_sse2_impl.h", "//third_party/libvpx/source/libvpx/vp9/vp9_cx_iface.c", "//third_party/libvpx/source/libvpx/vp9/vp9_dx_iface.c", "//third_party/libvpx/source/libvpx/vp9/vp9_iface_common.h", @@ -301,6 +305,7 @@ libvpx_srcs_x86 = [ "//third_party/libvpx/source/libvpx/vpx/vpx_image.h", "//third_party/libvpx/source/libvpx/vpx/vpx_integer.h", "//third_party/libvpx/source/libvpx/vpx_dsp/sad.c", + "//third_party/libvpx/source/libvpx/vpx_dsp/variance.c", "//third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd.c", "//third_party/libvpx/source/libvpx/vpx_mem/include/vpx_mem_intrnl.h", "//third_party/libvpx/source/libvpx/vpx_mem/vpx_mem.c", @@ -309,6 +314,7 @@ libvpx_srcs_x86 = [ "//third_party/libvpx/source/libvpx/vpx_ports/mem.h", "//third_party/libvpx/source/libvpx/vpx_ports/mem_ops.h", "//third_party/libvpx/source/libvpx/vpx_ports/mem_ops_aligned.h", + "//third_party/libvpx/source/libvpx/vpx_ports/msvc.h", "//third_party/libvpx/source/libvpx/vpx_ports/vpx_once.h", "//third_party/libvpx/source/libvpx/vpx_ports/vpx_timer.h", "//third_party/libvpx/source/libvpx/vpx_ports/x86.h", @@ -338,9 +344,9 @@ libvpx_srcs_x86_assembly = [ "//third_party/libvpx/source/libvpx/vp8/common/x86/subpixel_mmx.asm", "//third_party/libvpx/source/libvpx/vp8/common/x86/subpixel_sse2.asm", "//third_party/libvpx/source/libvpx/vp8/common/x86/subpixel_ssse3.asm", - "//third_party/libvpx/source/libvpx/vp8/common/x86/variance_impl_mmx.asm", "//third_party/libvpx/source/libvpx/vp8/common/x86/variance_impl_sse2.asm", "//third_party/libvpx/source/libvpx/vp8/common/x86/variance_impl_ssse3.asm", + "//third_party/libvpx/source/libvpx/vp8/common/x86/vp8_variance_impl_mmx.asm", "//third_party/libvpx/source/libvpx/vp8/encoder/x86/dct_mmx.asm", 
"//third_party/libvpx/source/libvpx/vp8/encoder/x86/dct_sse2.asm", "//third_party/libvpx/source/libvpx/vp8/encoder/x86/encodeopt.asm", @@ -369,30 +375,31 @@ libvpx_srcs_x86_assembly = [ "//third_party/libvpx/source/libvpx/vpx_dsp/x86/sad_sse3.asm", "//third_party/libvpx/source/libvpx/vpx_dsp/x86/sad_sse4.asm", "//third_party/libvpx/source/libvpx/vpx_dsp/x86/sad_ssse3.asm", + "//third_party/libvpx/source/libvpx/vpx_dsp/x86/variance_impl_mmx.asm", "//third_party/libvpx/source/libvpx/vpx_ports/emms.asm", "//third_party/libvpx/source/libvpx/vpx_ports/x86_abi_support.asm", ] libvpx_srcs_x86_mmx = [ "//third_party/libvpx/source/libvpx/vp8/common/x86/idct_blk_mmx.c", - "//third_party/libvpx/source/libvpx/vp8/common/x86/variance_mmx.c", + "//third_party/libvpx/source/libvpx/vp8/common/x86/vp8_variance_mmx.c", "//third_party/libvpx/source/libvpx/vp8/encoder/x86/vp8_enc_stubs_mmx.c", + "//third_party/libvpx/source/libvpx/vpx_dsp/x86/variance_mmx.c", ] libvpx_srcs_x86_sse2 = [ "//third_party/libvpx/source/libvpx/vp8/common/x86/idct_blk_sse2.c", "//third_party/libvpx/source/libvpx/vp8/common/x86/recon_wrapper_sse2.c", - "//third_party/libvpx/source/libvpx/vp8/common/x86/variance_sse2.c", + "//third_party/libvpx/source/libvpx/vp8/common/x86/vp8_variance_sse2.c", "//third_party/libvpx/source/libvpx/vp8/encoder/x86/denoising_sse2.c", "//third_party/libvpx/source/libvpx/vp8/encoder/x86/quantize_sse2.c", "//third_party/libvpx/source/libvpx/vp8/encoder/x86/vp8_enc_stubs_sse2.c", "//third_party/libvpx/source/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c", "//third_party/libvpx/source/libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c", "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_avg_intrin_sse2.c", - "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct32x32_sse2.c", - "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_impl_sse2.c", "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c", "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_denoiser_sse2.c", "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c", "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_variance_sse2.c", + "//third_party/libvpx/source/libvpx/vpx_dsp/x86/variance_sse2.c", ] libvpx_srcs_x86_sse3 = [ ] @@ -410,14 +417,14 @@ libvpx_srcs_x86_avx = [ libvpx_srcs_x86_avx2 = [ "//third_party/libvpx/source/libvpx/vp9/common/x86/vp9_loopfilter_intrin_avx2.c", "//third_party/libvpx/source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c", - "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct32x32_avx2.c", "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_avx2.c", "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_intrin_avx2.c", "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c", "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_variance_avx2.c", - "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_variance_impl_intrin_avx2.c", "//third_party/libvpx/source/libvpx/vpx_dsp/x86/sad4d_avx2.c", "//third_party/libvpx/source/libvpx/vpx_dsp/x86/sad_avx2.c", + "//third_party/libvpx/source/libvpx/vpx_dsp/x86/variance_avx2.c", + "//third_party/libvpx/source/libvpx/vpx_dsp/x86/variance_impl_avx2.c", ] libvpx_srcs_x86_64 = [ "//third_party/libvpx/source/libvpx/vp8/common/alloccommon.c", @@ -604,6 +611,7 @@ libvpx_srcs_x86_64 = [ "//third_party/libvpx/source/libvpx/vp9/common/vp9_thread_common.h", "//third_party/libvpx/source/libvpx/vp9/common/vp9_tile_common.c", 
"//third_party/libvpx/source/libvpx/vp9/common/vp9_tile_common.h", + "//third_party/libvpx/source/libvpx/vp9/common/x86/convolve.h", "//third_party/libvpx/source/libvpx/vp9/common/x86/vp9_asm_stubs.c", "//third_party/libvpx/source/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.h", "//third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodeframe.c", @@ -696,7 +704,10 @@ libvpx_srcs_x86_64 = [ "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_write_bit_buffer.h", "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_writer.c", "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_writer.h", + "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct32x32_avx2_impl.h", + "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct32x32_sse2_impl.h", "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_sse2.h", + "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_sse2_impl.h", "//third_party/libvpx/source/libvpx/vp9/vp9_cx_iface.c", "//third_party/libvpx/source/libvpx/vp9/vp9_dx_iface.c", "//third_party/libvpx/source/libvpx/vp9/vp9_iface_common.h", @@ -717,6 +728,7 @@ libvpx_srcs_x86_64 = [ "//third_party/libvpx/source/libvpx/vpx/vpx_image.h", "//third_party/libvpx/source/libvpx/vpx/vpx_integer.h", "//third_party/libvpx/source/libvpx/vpx_dsp/sad.c", + "//third_party/libvpx/source/libvpx/vpx_dsp/variance.c", "//third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd.c", "//third_party/libvpx/source/libvpx/vpx_mem/include/vpx_mem_intrnl.h", "//third_party/libvpx/source/libvpx/vpx_mem/vpx_mem.c", @@ -725,6 +737,7 @@ libvpx_srcs_x86_64 = [ "//third_party/libvpx/source/libvpx/vpx_ports/mem.h", "//third_party/libvpx/source/libvpx/vpx_ports/mem_ops.h", "//third_party/libvpx/source/libvpx/vpx_ports/mem_ops_aligned.h", + "//third_party/libvpx/source/libvpx/vpx_ports/msvc.h", "//third_party/libvpx/source/libvpx/vpx_ports/vpx_once.h", "//third_party/libvpx/source/libvpx/vpx_ports/vpx_timer.h", "//third_party/libvpx/source/libvpx/vpx_ports/x86.h", @@ -755,9 +768,9 @@ libvpx_srcs_x86_64_assembly = [ "//third_party/libvpx/source/libvpx/vp8/common/x86/subpixel_mmx.asm", "//third_party/libvpx/source/libvpx/vp8/common/x86/subpixel_sse2.asm", "//third_party/libvpx/source/libvpx/vp8/common/x86/subpixel_ssse3.asm", - "//third_party/libvpx/source/libvpx/vp8/common/x86/variance_impl_mmx.asm", "//third_party/libvpx/source/libvpx/vp8/common/x86/variance_impl_sse2.asm", "//third_party/libvpx/source/libvpx/vp8/common/x86/variance_impl_ssse3.asm", + "//third_party/libvpx/source/libvpx/vp8/common/x86/vp8_variance_impl_mmx.asm", "//third_party/libvpx/source/libvpx/vp8/encoder/x86/dct_mmx.asm", "//third_party/libvpx/source/libvpx/vp8/encoder/x86/dct_sse2.asm", "//third_party/libvpx/source/libvpx/vp8/encoder/x86/encodeopt.asm", @@ -791,30 +804,31 @@ libvpx_srcs_x86_64_assembly = [ "//third_party/libvpx/source/libvpx/vpx_dsp/x86/sad_sse3.asm", "//third_party/libvpx/source/libvpx/vpx_dsp/x86/sad_sse4.asm", "//third_party/libvpx/source/libvpx/vpx_dsp/x86/sad_ssse3.asm", + "//third_party/libvpx/source/libvpx/vpx_dsp/x86/variance_impl_mmx.asm", "//third_party/libvpx/source/libvpx/vpx_ports/emms.asm", "//third_party/libvpx/source/libvpx/vpx_ports/x86_abi_support.asm", ] libvpx_srcs_x86_64_mmx = [ "//third_party/libvpx/source/libvpx/vp8/common/x86/idct_blk_mmx.c", - "//third_party/libvpx/source/libvpx/vp8/common/x86/variance_mmx.c", + "//third_party/libvpx/source/libvpx/vp8/common/x86/vp8_variance_mmx.c", "//third_party/libvpx/source/libvpx/vp8/encoder/x86/vp8_enc_stubs_mmx.c", + 
"//third_party/libvpx/source/libvpx/vpx_dsp/x86/variance_mmx.c", ] libvpx_srcs_x86_64_sse2 = [ "//third_party/libvpx/source/libvpx/vp8/common/x86/idct_blk_sse2.c", "//third_party/libvpx/source/libvpx/vp8/common/x86/recon_wrapper_sse2.c", - "//third_party/libvpx/source/libvpx/vp8/common/x86/variance_sse2.c", + "//third_party/libvpx/source/libvpx/vp8/common/x86/vp8_variance_sse2.c", "//third_party/libvpx/source/libvpx/vp8/encoder/x86/denoising_sse2.c", "//third_party/libvpx/source/libvpx/vp8/encoder/x86/quantize_sse2.c", "//third_party/libvpx/source/libvpx/vp8/encoder/x86/vp8_enc_stubs_sse2.c", "//third_party/libvpx/source/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c", "//third_party/libvpx/source/libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c", "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_avg_intrin_sse2.c", - "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct32x32_sse2.c", - "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_impl_sse2.c", "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c", "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_denoiser_sse2.c", "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c", "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_variance_sse2.c", + "//third_party/libvpx/source/libvpx/vpx_dsp/x86/variance_sse2.c", ] libvpx_srcs_x86_64_sse3 = [ ] @@ -832,14 +846,14 @@ libvpx_srcs_x86_64_avx = [ libvpx_srcs_x86_64_avx2 = [ "//third_party/libvpx/source/libvpx/vp9/common/x86/vp9_loopfilter_intrin_avx2.c", "//third_party/libvpx/source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c", - "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct32x32_avx2.c", "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_avx2.c", "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_intrin_avx2.c", "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c", "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_variance_avx2.c", - "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_variance_impl_intrin_avx2.c", "//third_party/libvpx/source/libvpx/vpx_dsp/x86/sad4d_avx2.c", "//third_party/libvpx/source/libvpx/vpx_dsp/x86/sad_avx2.c", + "//third_party/libvpx/source/libvpx/vpx_dsp/x86/variance_avx2.c", + "//third_party/libvpx/source/libvpx/vpx_dsp/x86/variance_impl_avx2.c", ] libvpx_srcs_arm = [ "//third_party/libvpx/source/libvpx/vp8/common/alloccommon.c", @@ -1140,6 +1154,7 @@ libvpx_srcs_arm = [ "//third_party/libvpx/source/libvpx/vpx/vpx_image.h", "//third_party/libvpx/source/libvpx/vpx/vpx_integer.h", "//third_party/libvpx/source/libvpx/vpx_dsp/sad.c", + "//third_party/libvpx/source/libvpx/vpx_dsp/variance.c", "//third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd.c", "//third_party/libvpx/source/libvpx/vpx_mem/include/vpx_mem_intrnl.h", "//third_party/libvpx/source/libvpx/vpx_mem/vpx_mem.c", @@ -1150,6 +1165,7 @@ libvpx_srcs_arm = [ "//third_party/libvpx/source/libvpx/vpx_ports/mem.h", "//third_party/libvpx/source/libvpx/vpx_ports/mem_ops.h", "//third_party/libvpx/source/libvpx/vpx_ports/mem_ops_aligned.h", + "//third_party/libvpx/source/libvpx/vpx_ports/msvc.h", "//third_party/libvpx/source/libvpx/vpx_ports/vpx_once.h", "//third_party/libvpx/source/libvpx/vpx_ports/vpx_timer.h", "//third_party/libvpx/source/libvpx/vpx_scale/generic/gen_scalers.c", @@ -1175,15 +1191,13 @@ libvpx_srcs_arm_assembly = [ "//third_party/libvpx/source/libvpx/vp8/common/arm/armv6/loopfilter_v6.asm", 
"//third_party/libvpx/source/libvpx/vp8/common/arm/armv6/simpleloopfilter_v6.asm", "//third_party/libvpx/source/libvpx/vp8/common/arm/armv6/sixtappredict8x4_v6.asm", - "//third_party/libvpx/source/libvpx/vp8/common/arm/armv6/vp8_variance16x16_armv6.asm", - "//third_party/libvpx/source/libvpx/vp8/common/arm/armv6/vp8_variance8x8_armv6.asm", "//third_party/libvpx/source/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm", "//third_party/libvpx/source/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm", "//third_party/libvpx/source/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm", - "//third_party/libvpx/source/libvpx/vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm", "//third_party/libvpx/source/libvpx/vp8/encoder/arm/armv6/vp8_short_fdct4x4_armv6.asm", "//third_party/libvpx/source/libvpx/vp8/encoder/arm/armv6/walsh_v6.asm", "//third_party/libvpx/source/libvpx/vpx_dsp/arm/sad_media.asm", + "//third_party/libvpx/source/libvpx/vpx_dsp/arm/variance_media.asm", ] libvpx_srcs_arm_neon = [ "//third_party/libvpx/source/libvpx/vp8/common/alloccommon.c", @@ -1210,7 +1224,6 @@ libvpx_srcs_arm_neon = [ "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/reconintra_neon.c", "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c", "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/sixtappredict_neon.c", - "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/variance_neon.c", "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance_neon.c", "//third_party/libvpx/source/libvpx/vp8/common/arm/variance_arm.c", "//third_party/libvpx/source/libvpx/vp8/common/blockd.c", @@ -1288,7 +1301,6 @@ libvpx_srcs_arm_neon = [ "//third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.c", "//third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/shortfdct_neon.c", "//third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/subtract_neon.c", - "//third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/vp8_mse16x16_neon.c", "//third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c", "//third_party/libvpx/source/libvpx/vp8/encoder/bitstream.c", "//third_party/libvpx/source/libvpx/vp8/encoder/bitstream.h", @@ -1343,6 +1355,7 @@ libvpx_srcs_arm_neon = [ "//third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c", "//third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c", "//third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_neon.c", + "//third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_reconintra_neon.c", "//third_party/libvpx/source/libvpx/vp9/common/vp9_alloccommon.c", "//third_party/libvpx/source/libvpx/vp9/common/vp9_alloccommon.h", "//third_party/libvpx/source/libvpx/vp9/common/vp9_blockd.c", @@ -1519,7 +1532,9 @@ libvpx_srcs_arm_neon = [ "//third_party/libvpx/source/libvpx/vpx/vpx_integer.h", "//third_party/libvpx/source/libvpx/vpx_dsp/arm/sad4d_neon.c", "//third_party/libvpx/source/libvpx/vpx_dsp/arm/sad_neon.c", + "//third_party/libvpx/source/libvpx/vpx_dsp/arm/variance_neon.c", "//third_party/libvpx/source/libvpx/vpx_dsp/sad.c", + "//third_party/libvpx/source/libvpx/vpx_dsp/variance.c", "//third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd.c", "//third_party/libvpx/source/libvpx/vpx_mem/include/vpx_mem_intrnl.h", "//third_party/libvpx/source/libvpx/vpx_mem/vpx_mem.c", @@ -1530,6 +1545,7 @@ libvpx_srcs_arm_neon = [ "//third_party/libvpx/source/libvpx/vpx_ports/mem.h", 
"//third_party/libvpx/source/libvpx/vpx_ports/mem_ops.h", "//third_party/libvpx/source/libvpx/vpx_ports/mem_ops_aligned.h", + "//third_party/libvpx/source/libvpx/vpx_ports/msvc.h", "//third_party/libvpx/source/libvpx/vpx_ports/vpx_once.h", "//third_party/libvpx/source/libvpx/vpx_ports/vpx_timer.h", "//third_party/libvpx/source/libvpx/vpx_scale/generic/gen_scalers.c", @@ -1555,12 +1571,9 @@ libvpx_srcs_arm_neon_assembly = [ "//third_party/libvpx/source/libvpx/vp8/common/arm/armv6/loopfilter_v6.asm", "//third_party/libvpx/source/libvpx/vp8/common/arm/armv6/simpleloopfilter_v6.asm", "//third_party/libvpx/source/libvpx/vp8/common/arm/armv6/sixtappredict8x4_v6.asm", - "//third_party/libvpx/source/libvpx/vp8/common/arm/armv6/vp8_variance16x16_armv6.asm", - "//third_party/libvpx/source/libvpx/vp8/common/arm/armv6/vp8_variance8x8_armv6.asm", "//third_party/libvpx/source/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm", "//third_party/libvpx/source/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm", "//third_party/libvpx/source/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm", - "//third_party/libvpx/source/libvpx/vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm", "//third_party/libvpx/source/libvpx/vp8/encoder/arm/armv6/vp8_short_fdct4x4_armv6.asm", "//third_party/libvpx/source/libvpx/vp8/encoder/arm/armv6/walsh_v6.asm", "//third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve8_avg_neon_asm.asm", @@ -1582,6 +1595,7 @@ libvpx_srcs_arm_neon_assembly = [ "//third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_reconintra_neon_asm.asm", "//third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_save_reg_neon.asm", "//third_party/libvpx/source/libvpx/vpx_dsp/arm/sad_media.asm", + "//third_party/libvpx/source/libvpx/vpx_dsp/arm/variance_media.asm", ] libvpx_srcs_arm_neon_cpu_detect = [ "//third_party/libvpx/source/libvpx/vp8/common/alloccommon.c", @@ -1882,6 +1896,7 @@ libvpx_srcs_arm_neon_cpu_detect = [ "//third_party/libvpx/source/libvpx/vpx/vpx_image.h", "//third_party/libvpx/source/libvpx/vpx/vpx_integer.h", "//third_party/libvpx/source/libvpx/vpx_dsp/sad.c", + "//third_party/libvpx/source/libvpx/vpx_dsp/variance.c", "//third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd.c", "//third_party/libvpx/source/libvpx/vpx_mem/include/vpx_mem_intrnl.h", "//third_party/libvpx/source/libvpx/vpx_mem/vpx_mem.c", @@ -1892,6 +1907,7 @@ libvpx_srcs_arm_neon_cpu_detect = [ "//third_party/libvpx/source/libvpx/vpx_ports/mem.h", "//third_party/libvpx/source/libvpx/vpx_ports/mem_ops.h", "//third_party/libvpx/source/libvpx/vpx_ports/mem_ops_aligned.h", + "//third_party/libvpx/source/libvpx/vpx_ports/msvc.h", "//third_party/libvpx/source/libvpx/vpx_ports/vpx_once.h", "//third_party/libvpx/source/libvpx/vpx_ports/vpx_timer.h", "//third_party/libvpx/source/libvpx/vpx_scale/generic/gen_scalers.c", @@ -1917,15 +1933,13 @@ libvpx_srcs_arm_neon_cpu_detect_assembly = [ "//third_party/libvpx/source/libvpx/vp8/common/arm/armv6/loopfilter_v6.asm", "//third_party/libvpx/source/libvpx/vp8/common/arm/armv6/simpleloopfilter_v6.asm", "//third_party/libvpx/source/libvpx/vp8/common/arm/armv6/sixtappredict8x4_v6.asm", - "//third_party/libvpx/source/libvpx/vp8/common/arm/armv6/vp8_variance16x16_armv6.asm", - "//third_party/libvpx/source/libvpx/vp8/common/arm/armv6/vp8_variance8x8_armv6.asm", "//third_party/libvpx/source/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm", 
"//third_party/libvpx/source/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm", "//third_party/libvpx/source/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm", - "//third_party/libvpx/source/libvpx/vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm", "//third_party/libvpx/source/libvpx/vp8/encoder/arm/armv6/vp8_short_fdct4x4_armv6.asm", "//third_party/libvpx/source/libvpx/vp8/encoder/arm/armv6/walsh_v6.asm", "//third_party/libvpx/source/libvpx/vpx_dsp/arm/sad_media.asm", + "//third_party/libvpx/source/libvpx/vpx_dsp/arm/variance_media.asm", "//third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve8_avg_neon_asm.asm", "//third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve8_neon_asm.asm", "//third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve_avg_neon_asm.asm", @@ -1962,19 +1976,18 @@ libvpx_srcs_arm_neon_cpu_detect_neon = [ "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/reconintra_neon.c", "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c", "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/sixtappredict_neon.c", - "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/variance_neon.c", "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance_neon.c", "//third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/denoising_neon.c", "//third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.c", "//third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/shortfdct_neon.c", "//third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/subtract_neon.c", - "//third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/vp8_mse16x16_neon.c", "//third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c", "//third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve_neon.c", "//third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct16x16_neon.c", "//third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c", "//third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c", "//third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_neon.c", + "//third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_reconintra_neon.c", "//third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_avg_neon.c", "//third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c", "//third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c", @@ -1982,6 +1995,7 @@ libvpx_srcs_arm_neon_cpu_detect_neon = [ "//third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_variance_neon.c", "//third_party/libvpx/source/libvpx/vpx_dsp/arm/sad4d_neon.c", "//third_party/libvpx/source/libvpx/vpx_dsp/arm/sad_neon.c", + "//third_party/libvpx/source/libvpx/vpx_dsp/arm/variance_neon.c", ] libvpx_srcs_arm64 = [ "//third_party/libvpx/source/libvpx/vp8/common/alloccommon.c", @@ -2005,7 +2019,6 @@ libvpx_srcs_arm64 = [ "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/reconintra_neon.c", "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c", "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/sixtappredict_neon.c", - "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/variance_neon.c", "//third_party/libvpx/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance_neon.c", "//third_party/libvpx/source/libvpx/vp8/common/arm/variance_arm.c", "//third_party/libvpx/source/libvpx/vp8/common/blockd.c", @@ -2083,7 +2096,6 @@ libvpx_srcs_arm64 = [ 
"//third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.c", "//third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/shortfdct_neon.c", "//third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/subtract_neon.c", - "//third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/vp8_mse16x16_neon.c", "//third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c", "//third_party/libvpx/source/libvpx/vp8/encoder/bitstream.c", "//third_party/libvpx/source/libvpx/vp8/encoder/bitstream.h", @@ -2329,7 +2341,9 @@ libvpx_srcs_arm64 = [ "//third_party/libvpx/source/libvpx/vpx/vpx_integer.h", "//third_party/libvpx/source/libvpx/vpx_dsp/arm/sad4d_neon.c", "//third_party/libvpx/source/libvpx/vpx_dsp/arm/sad_neon.c", + "//third_party/libvpx/source/libvpx/vpx_dsp/arm/variance_neon.c", "//third_party/libvpx/source/libvpx/vpx_dsp/sad.c", + "//third_party/libvpx/source/libvpx/vpx_dsp/variance.c", "//third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd.c", "//third_party/libvpx/source/libvpx/vpx_mem/include/vpx_mem_intrnl.h", "//third_party/libvpx/source/libvpx/vpx_mem/vpx_mem.c", @@ -2340,6 +2354,7 @@ libvpx_srcs_arm64 = [ "//third_party/libvpx/source/libvpx/vpx_ports/mem.h", "//third_party/libvpx/source/libvpx/vpx_ports/mem_ops.h", "//third_party/libvpx/source/libvpx/vpx_ports/mem_ops_aligned.h", + "//third_party/libvpx/source/libvpx/vpx_ports/msvc.h", "//third_party/libvpx/source/libvpx/vpx_ports/vpx_once.h", "//third_party/libvpx/source/libvpx/vpx_ports/vpx_timer.h", "//third_party/libvpx/source/libvpx/vpx_scale/generic/gen_scalers.c", @@ -2645,6 +2660,7 @@ libvpx_srcs_mips = [ "//third_party/libvpx/source/libvpx/vpx/vpx_image.h", "//third_party/libvpx/source/libvpx/vpx/vpx_integer.h", "//third_party/libvpx/source/libvpx/vpx_dsp/sad.c", + "//third_party/libvpx/source/libvpx/vpx_dsp/variance.c", "//third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd.c", "//third_party/libvpx/source/libvpx/vpx_mem/include/vpx_mem_intrnl.h", "//third_party/libvpx/source/libvpx/vpx_mem/vpx_mem.c", @@ -2653,6 +2669,7 @@ libvpx_srcs_mips = [ "//third_party/libvpx/source/libvpx/vpx_ports/mem.h", "//third_party/libvpx/source/libvpx/vpx_ports/mem_ops.h", "//third_party/libvpx/source/libvpx/vpx_ports/mem_ops_aligned.h", + "//third_party/libvpx/source/libvpx/vpx_ports/msvc.h", "//third_party/libvpx/source/libvpx/vpx_ports/vpx_once.h", "//third_party/libvpx/source/libvpx/vpx_ports/vpx_timer.h", "//third_party/libvpx/source/libvpx/vpx_scale/generic/gen_scalers.c", @@ -2956,6 +2973,7 @@ libvpx_srcs_nacl = [ "//third_party/libvpx/source/libvpx/vpx/vpx_image.h", "//third_party/libvpx/source/libvpx/vpx/vpx_integer.h", "//third_party/libvpx/source/libvpx/vpx_dsp/sad.c", + "//third_party/libvpx/source/libvpx/vpx_dsp/variance.c", "//third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd.c", "//third_party/libvpx/source/libvpx/vpx_mem/include/vpx_mem_intrnl.h", "//third_party/libvpx/source/libvpx/vpx_mem/vpx_mem.c", @@ -2964,6 +2982,7 @@ libvpx_srcs_nacl = [ "//third_party/libvpx/source/libvpx/vpx_ports/mem.h", "//third_party/libvpx/source/libvpx/vpx_ports/mem_ops.h", "//third_party/libvpx/source/libvpx/vpx_ports/mem_ops_aligned.h", + "//third_party/libvpx/source/libvpx/vpx_ports/msvc.h", "//third_party/libvpx/source/libvpx/vpx_ports/vpx_once.h", "//third_party/libvpx/source/libvpx/vpx_ports/vpx_timer.h", "//third_party/libvpx/source/libvpx/vpx_scale/generic/gen_scalers.c", @@ -3267,6 +3286,7 @@ libvpx_srcs_generic = [ "//third_party/libvpx/source/libvpx/vpx/vpx_image.h", 
"//third_party/libvpx/source/libvpx/vpx/vpx_integer.h", "//third_party/libvpx/source/libvpx/vpx_dsp/sad.c", + "//third_party/libvpx/source/libvpx/vpx_dsp/variance.c", "//third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd.c", "//third_party/libvpx/source/libvpx/vpx_mem/include/vpx_mem_intrnl.h", "//third_party/libvpx/source/libvpx/vpx_mem/vpx_mem.c", @@ -3275,6 +3295,7 @@ libvpx_srcs_generic = [ "//third_party/libvpx/source/libvpx/vpx_ports/mem.h", "//third_party/libvpx/source/libvpx/vpx_ports/mem_ops.h", "//third_party/libvpx/source/libvpx/vpx_ports/mem_ops_aligned.h", + "//third_party/libvpx/source/libvpx/vpx_ports/msvc.h", "//third_party/libvpx/source/libvpx/vpx_ports/vpx_once.h", "//third_party/libvpx/source/libvpx/vpx_ports/vpx_timer.h", "//third_party/libvpx/source/libvpx/vpx_scale/generic/gen_scalers.c", diff --git a/chromium/third_party/libvpx/libvpx_srcs_arm.gypi b/chromium/third_party/libvpx/libvpx_srcs_arm.gypi index bd6d32b07b7..1f457e299f6 100644 --- a/chromium/third_party/libvpx/libvpx_srcs_arm.gypi +++ b/chromium/third_party/libvpx/libvpx_srcs_arm.gypi @@ -22,8 +22,6 @@ '<(libvpx_source)/vp8/common/arm/armv6/loopfilter_v6.asm', '<(libvpx_source)/vp8/common/arm/armv6/simpleloopfilter_v6.asm', '<(libvpx_source)/vp8/common/arm/armv6/sixtappredict8x4_v6.asm', - '<(libvpx_source)/vp8/common/arm/armv6/vp8_variance16x16_armv6.asm', - '<(libvpx_source)/vp8/common/arm/armv6/vp8_variance8x8_armv6.asm', '<(libvpx_source)/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm', '<(libvpx_source)/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm', '<(libvpx_source)/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm', @@ -103,7 +101,6 @@ '<(libvpx_source)/vp8/decoder/onyxd_int.h', '<(libvpx_source)/vp8/decoder/threading.c', '<(libvpx_source)/vp8/decoder/treereader.h', - '<(libvpx_source)/vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm', '<(libvpx_source)/vp8/encoder/arm/armv6/vp8_short_fdct4x4_armv6.asm', '<(libvpx_source)/vp8/encoder/arm/armv6/walsh_v6.asm', '<(libvpx_source)/vp8/encoder/arm/dct_arm.c', @@ -325,7 +322,9 @@ '<(libvpx_source)/vpx/vpx_image.h', '<(libvpx_source)/vpx/vpx_integer.h', '<(libvpx_source)/vpx_dsp/arm/sad_media.asm', + '<(libvpx_source)/vpx_dsp/arm/variance_media.asm', '<(libvpx_source)/vpx_dsp/sad.c', + '<(libvpx_source)/vpx_dsp/variance.c', '<(libvpx_source)/vpx_dsp/vpx_dsp_rtcd.c', '<(libvpx_source)/vpx_mem/include/vpx_mem_intrnl.h', '<(libvpx_source)/vpx_mem/vpx_mem.c', @@ -336,6 +335,7 @@ '<(libvpx_source)/vpx_ports/mem.h', '<(libvpx_source)/vpx_ports/mem_ops.h', '<(libvpx_source)/vpx_ports/mem_ops_aligned.h', + '<(libvpx_source)/vpx_ports/msvc.h', '<(libvpx_source)/vpx_ports/vpx_once.h', '<(libvpx_source)/vpx_ports/vpx_timer.h', '<(libvpx_source)/vpx_scale/generic/gen_scalers.c', diff --git a/chromium/third_party/libvpx/libvpx_srcs_arm64.gypi b/chromium/third_party/libvpx/libvpx_srcs_arm64.gypi index 31e6fca63f0..5b4acd877ea 100644 --- a/chromium/third_party/libvpx/libvpx_srcs_arm64.gypi +++ b/chromium/third_party/libvpx/libvpx_srcs_arm64.gypi @@ -26,7 +26,6 @@ '<(libvpx_source)/vp8/common/arm/neon/reconintra_neon.c', '<(libvpx_source)/vp8/common/arm/neon/shortidct4x4llm_neon.c', '<(libvpx_source)/vp8/common/arm/neon/sixtappredict_neon.c', - '<(libvpx_source)/vp8/common/arm/neon/variance_neon.c', '<(libvpx_source)/vp8/common/arm/neon/vp8_subpixelvariance_neon.c', '<(libvpx_source)/vp8/common/arm/variance_arm.c', '<(libvpx_source)/vp8/common/blockd.c', @@ -104,7 +103,6 @@ 
'<(libvpx_source)/vp8/encoder/arm/neon/fastquantizeb_neon.c', '<(libvpx_source)/vp8/encoder/arm/neon/shortfdct_neon.c', '<(libvpx_source)/vp8/encoder/arm/neon/subtract_neon.c', - '<(libvpx_source)/vp8/encoder/arm/neon/vp8_mse16x16_neon.c', '<(libvpx_source)/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c', '<(libvpx_source)/vp8/encoder/bitstream.c', '<(libvpx_source)/vp8/encoder/bitstream.h', @@ -350,7 +348,9 @@ '<(libvpx_source)/vpx/vpx_integer.h', '<(libvpx_source)/vpx_dsp/arm/sad4d_neon.c', '<(libvpx_source)/vpx_dsp/arm/sad_neon.c', + '<(libvpx_source)/vpx_dsp/arm/variance_neon.c', '<(libvpx_source)/vpx_dsp/sad.c', + '<(libvpx_source)/vpx_dsp/variance.c', '<(libvpx_source)/vpx_dsp/vpx_dsp_rtcd.c', '<(libvpx_source)/vpx_mem/include/vpx_mem_intrnl.h', '<(libvpx_source)/vpx_mem/vpx_mem.c', @@ -361,6 +361,7 @@ '<(libvpx_source)/vpx_ports/mem.h', '<(libvpx_source)/vpx_ports/mem_ops.h', '<(libvpx_source)/vpx_ports/mem_ops_aligned.h', + '<(libvpx_source)/vpx_ports/msvc.h', '<(libvpx_source)/vpx_ports/vpx_once.h', '<(libvpx_source)/vpx_ports/vpx_timer.h', '<(libvpx_source)/vpx_scale/generic/gen_scalers.c', diff --git a/chromium/third_party/libvpx/libvpx_srcs_arm_neon.gypi b/chromium/third_party/libvpx/libvpx_srcs_arm_neon.gypi index 277180640de..81e03e049b2 100644 --- a/chromium/third_party/libvpx/libvpx_srcs_arm_neon.gypi +++ b/chromium/third_party/libvpx/libvpx_srcs_arm_neon.gypi @@ -22,8 +22,6 @@ '<(libvpx_source)/vp8/common/arm/armv6/loopfilter_v6.asm', '<(libvpx_source)/vp8/common/arm/armv6/simpleloopfilter_v6.asm', '<(libvpx_source)/vp8/common/arm/armv6/sixtappredict8x4_v6.asm', - '<(libvpx_source)/vp8/common/arm/armv6/vp8_variance16x16_armv6.asm', - '<(libvpx_source)/vp8/common/arm/armv6/vp8_variance8x8_armv6.asm', '<(libvpx_source)/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm', '<(libvpx_source)/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm', '<(libvpx_source)/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm', @@ -48,7 +46,6 @@ '<(libvpx_source)/vp8/common/arm/neon/reconintra_neon.c', '<(libvpx_source)/vp8/common/arm/neon/shortidct4x4llm_neon.c', '<(libvpx_source)/vp8/common/arm/neon/sixtappredict_neon.c', - '<(libvpx_source)/vp8/common/arm/neon/variance_neon.c', '<(libvpx_source)/vp8/common/arm/neon/vp8_subpixelvariance_neon.c', '<(libvpx_source)/vp8/common/arm/variance_arm.c', '<(libvpx_source)/vp8/common/blockd.c', @@ -121,7 +118,6 @@ '<(libvpx_source)/vp8/decoder/onyxd_int.h', '<(libvpx_source)/vp8/decoder/threading.c', '<(libvpx_source)/vp8/decoder/treereader.h', - '<(libvpx_source)/vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm', '<(libvpx_source)/vp8/encoder/arm/armv6/vp8_short_fdct4x4_armv6.asm', '<(libvpx_source)/vp8/encoder/arm/armv6/walsh_v6.asm', '<(libvpx_source)/vp8/encoder/arm/dct_arm.c', @@ -129,7 +125,6 @@ '<(libvpx_source)/vp8/encoder/arm/neon/fastquantizeb_neon.c', '<(libvpx_source)/vp8/encoder/arm/neon/shortfdct_neon.c', '<(libvpx_source)/vp8/encoder/arm/neon/subtract_neon.c', - '<(libvpx_source)/vp8/encoder/arm/neon/vp8_mse16x16_neon.c', '<(libvpx_source)/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c', '<(libvpx_source)/vp8/encoder/bitstream.c', '<(libvpx_source)/vp8/encoder/bitstream.h', @@ -200,6 +195,7 @@ '<(libvpx_source)/vp9/common/arm/neon/vp9_loopfilter_8_neon_asm.asm', '<(libvpx_source)/vp9/common/arm/neon/vp9_loopfilter_neon.c', '<(libvpx_source)/vp9/common/arm/neon/vp9_mb_lpf_neon.asm', + '<(libvpx_source)/vp9/common/arm/neon/vp9_reconintra_neon.c', 
'<(libvpx_source)/vp9/common/arm/neon/vp9_reconintra_neon_asm.asm', '<(libvpx_source)/vp9/common/arm/neon/vp9_save_reg_neon.asm', '<(libvpx_source)/vp9/common/vp9_alloccommon.c', @@ -379,7 +375,10 @@ '<(libvpx_source)/vpx_dsp/arm/sad4d_neon.c', '<(libvpx_source)/vpx_dsp/arm/sad_media.asm', '<(libvpx_source)/vpx_dsp/arm/sad_neon.c', + '<(libvpx_source)/vpx_dsp/arm/variance_media.asm', + '<(libvpx_source)/vpx_dsp/arm/variance_neon.c', '<(libvpx_source)/vpx_dsp/sad.c', + '<(libvpx_source)/vpx_dsp/variance.c', '<(libvpx_source)/vpx_dsp/vpx_dsp_rtcd.c', '<(libvpx_source)/vpx_mem/include/vpx_mem_intrnl.h', '<(libvpx_source)/vpx_mem/vpx_mem.c', @@ -390,6 +389,7 @@ '<(libvpx_source)/vpx_ports/mem.h', '<(libvpx_source)/vpx_ports/mem_ops.h', '<(libvpx_source)/vpx_ports/mem_ops_aligned.h', + '<(libvpx_source)/vpx_ports/msvc.h', '<(libvpx_source)/vpx_ports/vpx_once.h', '<(libvpx_source)/vpx_ports/vpx_timer.h', '<(libvpx_source)/vpx_scale/generic/gen_scalers.c', diff --git a/chromium/third_party/libvpx/libvpx_srcs_arm_neon_cpu_detect.gypi b/chromium/third_party/libvpx/libvpx_srcs_arm_neon_cpu_detect.gypi index bd6d32b07b7..1f457e299f6 100644 --- a/chromium/third_party/libvpx/libvpx_srcs_arm_neon_cpu_detect.gypi +++ b/chromium/third_party/libvpx/libvpx_srcs_arm_neon_cpu_detect.gypi @@ -22,8 +22,6 @@ '<(libvpx_source)/vp8/common/arm/armv6/loopfilter_v6.asm', '<(libvpx_source)/vp8/common/arm/armv6/simpleloopfilter_v6.asm', '<(libvpx_source)/vp8/common/arm/armv6/sixtappredict8x4_v6.asm', - '<(libvpx_source)/vp8/common/arm/armv6/vp8_variance16x16_armv6.asm', - '<(libvpx_source)/vp8/common/arm/armv6/vp8_variance8x8_armv6.asm', '<(libvpx_source)/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm', '<(libvpx_source)/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm', '<(libvpx_source)/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm', @@ -103,7 +101,6 @@ '<(libvpx_source)/vp8/decoder/onyxd_int.h', '<(libvpx_source)/vp8/decoder/threading.c', '<(libvpx_source)/vp8/decoder/treereader.h', - '<(libvpx_source)/vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm', '<(libvpx_source)/vp8/encoder/arm/armv6/vp8_short_fdct4x4_armv6.asm', '<(libvpx_source)/vp8/encoder/arm/armv6/walsh_v6.asm', '<(libvpx_source)/vp8/encoder/arm/dct_arm.c', @@ -325,7 +322,9 @@ '<(libvpx_source)/vpx/vpx_image.h', '<(libvpx_source)/vpx/vpx_integer.h', '<(libvpx_source)/vpx_dsp/arm/sad_media.asm', + '<(libvpx_source)/vpx_dsp/arm/variance_media.asm', '<(libvpx_source)/vpx_dsp/sad.c', + '<(libvpx_source)/vpx_dsp/variance.c', '<(libvpx_source)/vpx_dsp/vpx_dsp_rtcd.c', '<(libvpx_source)/vpx_mem/include/vpx_mem_intrnl.h', '<(libvpx_source)/vpx_mem/vpx_mem.c', @@ -336,6 +335,7 @@ '<(libvpx_source)/vpx_ports/mem.h', '<(libvpx_source)/vpx_ports/mem_ops.h', '<(libvpx_source)/vpx_ports/mem_ops_aligned.h', + '<(libvpx_source)/vpx_ports/msvc.h', '<(libvpx_source)/vpx_ports/vpx_once.h', '<(libvpx_source)/vpx_ports/vpx_timer.h', '<(libvpx_source)/vpx_scale/generic/gen_scalers.c', diff --git a/chromium/third_party/libvpx/libvpx_srcs_arm_neon_cpu_detect_intrinsics.gypi b/chromium/third_party/libvpx/libvpx_srcs_arm_neon_cpu_detect_intrinsics.gypi index 6b60be6cfe0..b60479e37c5 100644 --- a/chromium/third_party/libvpx/libvpx_srcs_arm_neon_cpu_detect_intrinsics.gypi +++ b/chromium/third_party/libvpx/libvpx_srcs_arm_neon_cpu_detect_intrinsics.gypi @@ -29,13 +29,11 @@ '<(libvpx_source)/vp8/common/arm/neon/reconintra_neon.c', '<(libvpx_source)/vp8/common/arm/neon/shortidct4x4llm_neon.c', 
'<(libvpx_source)/vp8/common/arm/neon/sixtappredict_neon.c', - '<(libvpx_source)/vp8/common/arm/neon/variance_neon.c', '<(libvpx_source)/vp8/common/arm/neon/vp8_subpixelvariance_neon.c', '<(libvpx_source)/vp8/encoder/arm/neon/denoising_neon.c', '<(libvpx_source)/vp8/encoder/arm/neon/fastquantizeb_neon.c', '<(libvpx_source)/vp8/encoder/arm/neon/shortfdct_neon.c', '<(libvpx_source)/vp8/encoder/arm/neon/subtract_neon.c', - '<(libvpx_source)/vp8/encoder/arm/neon/vp8_mse16x16_neon.c', '<(libvpx_source)/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c', '<(libvpx_source)/vp9/common/arm/neon/vp9_convolve8_avg_neon_asm.asm', '<(libvpx_source)/vp9/common/arm/neon/vp9_convolve8_neon_asm.asm', @@ -58,6 +56,7 @@ '<(libvpx_source)/vp9/common/arm/neon/vp9_loopfilter_8_neon_asm.asm', '<(libvpx_source)/vp9/common/arm/neon/vp9_loopfilter_neon.c', '<(libvpx_source)/vp9/common/arm/neon/vp9_mb_lpf_neon.asm', + '<(libvpx_source)/vp9/common/arm/neon/vp9_reconintra_neon.c', '<(libvpx_source)/vp9/common/arm/neon/vp9_reconintra_neon_asm.asm', '<(libvpx_source)/vp9/common/arm/neon/vp9_save_reg_neon.asm', '<(libvpx_source)/vp9/encoder/arm/neon/vp9_avg_neon.c', @@ -67,6 +66,7 @@ '<(libvpx_source)/vp9/encoder/arm/neon/vp9_variance_neon.c', '<(libvpx_source)/vpx_dsp/arm/sad4d_neon.c', '<(libvpx_source)/vpx_dsp/arm/sad_neon.c', + '<(libvpx_source)/vpx_dsp/arm/variance_neon.c', ], 'includes': [ 'ads2gas.gypi' ], 'cflags!': [ '-mfpu=vfpv3-d16' ], diff --git a/chromium/third_party/libvpx/libvpx_srcs_generic.gypi b/chromium/third_party/libvpx/libvpx_srcs_generic.gypi index eb808cc1436..71a5c4adcb8 100644 --- a/chromium/third_party/libvpx/libvpx_srcs_generic.gypi +++ b/chromium/third_party/libvpx/libvpx_srcs_generic.gypi @@ -295,6 +295,7 @@ '<(libvpx_source)/vpx/vpx_image.h', '<(libvpx_source)/vpx/vpx_integer.h', '<(libvpx_source)/vpx_dsp/sad.c', + '<(libvpx_source)/vpx_dsp/variance.c', '<(libvpx_source)/vpx_dsp/vpx_dsp_rtcd.c', '<(libvpx_source)/vpx_mem/include/vpx_mem_intrnl.h', '<(libvpx_source)/vpx_mem/vpx_mem.c', @@ -303,6 +304,7 @@ '<(libvpx_source)/vpx_ports/mem.h', '<(libvpx_source)/vpx_ports/mem_ops.h', '<(libvpx_source)/vpx_ports/mem_ops_aligned.h', + '<(libvpx_source)/vpx_ports/msvc.h', '<(libvpx_source)/vpx_ports/vpx_once.h', '<(libvpx_source)/vpx_ports/vpx_timer.h', '<(libvpx_source)/vpx_scale/generic/gen_scalers.c', diff --git a/chromium/third_party/libvpx/libvpx_srcs_mips.gypi b/chromium/third_party/libvpx/libvpx_srcs_mips.gypi index 8abf1082a11..df3f66fa791 100644 --- a/chromium/third_party/libvpx/libvpx_srcs_mips.gypi +++ b/chromium/third_party/libvpx/libvpx_srcs_mips.gypi @@ -297,6 +297,7 @@ '<(libvpx_source)/vpx/vpx_image.h', '<(libvpx_source)/vpx/vpx_integer.h', '<(libvpx_source)/vpx_dsp/sad.c', + '<(libvpx_source)/vpx_dsp/variance.c', '<(libvpx_source)/vpx_dsp/vpx_dsp_rtcd.c', '<(libvpx_source)/vpx_mem/include/vpx_mem_intrnl.h', '<(libvpx_source)/vpx_mem/vpx_mem.c', @@ -305,6 +306,7 @@ '<(libvpx_source)/vpx_ports/mem.h', '<(libvpx_source)/vpx_ports/mem_ops.h', '<(libvpx_source)/vpx_ports/mem_ops_aligned.h', + '<(libvpx_source)/vpx_ports/msvc.h', '<(libvpx_source)/vpx_ports/vpx_once.h', '<(libvpx_source)/vpx_ports/vpx_timer.h', '<(libvpx_source)/vpx_scale/generic/gen_scalers.c', diff --git a/chromium/third_party/libvpx/libvpx_srcs_nacl.gypi b/chromium/third_party/libvpx/libvpx_srcs_nacl.gypi index eb808cc1436..71a5c4adcb8 100644 --- a/chromium/third_party/libvpx/libvpx_srcs_nacl.gypi +++ b/chromium/third_party/libvpx/libvpx_srcs_nacl.gypi @@ -295,6 +295,7 @@ '<(libvpx_source)/vpx/vpx_image.h', 
'<(libvpx_source)/vpx/vpx_integer.h', '<(libvpx_source)/vpx_dsp/sad.c', + '<(libvpx_source)/vpx_dsp/variance.c', '<(libvpx_source)/vpx_dsp/vpx_dsp_rtcd.c', '<(libvpx_source)/vpx_mem/include/vpx_mem_intrnl.h', '<(libvpx_source)/vpx_mem/vpx_mem.c', @@ -303,6 +304,7 @@ '<(libvpx_source)/vpx_ports/mem.h', '<(libvpx_source)/vpx_ports/mem_ops.h', '<(libvpx_source)/vpx_ports/mem_ops_aligned.h', + '<(libvpx_source)/vpx_ports/msvc.h', '<(libvpx_source)/vpx_ports/vpx_once.h', '<(libvpx_source)/vpx_ports/vpx_timer.h', '<(libvpx_source)/vpx_scale/generic/gen_scalers.c', diff --git a/chromium/third_party/libvpx/libvpx_srcs_x86.gypi b/chromium/third_party/libvpx/libvpx_srcs_x86.gypi index d61c186aca5..63f05e35f01 100644 --- a/chromium/third_party/libvpx/libvpx_srcs_x86.gypi +++ b/chromium/third_party/libvpx/libvpx_srcs_x86.gypi @@ -85,10 +85,10 @@ '<(libvpx_source)/vp8/common/x86/subpixel_mmx.asm', '<(libvpx_source)/vp8/common/x86/subpixel_sse2.asm', '<(libvpx_source)/vp8/common/x86/subpixel_ssse3.asm', - '<(libvpx_source)/vp8/common/x86/variance_impl_mmx.asm', '<(libvpx_source)/vp8/common/x86/variance_impl_sse2.asm', '<(libvpx_source)/vp8/common/x86/variance_impl_ssse3.asm', '<(libvpx_source)/vp8/common/x86/vp8_asm_stubs.c', + '<(libvpx_source)/vp8/common/x86/vp8_variance_impl_mmx.asm', '<(libvpx_source)/vp8/decoder/dboolhuff.c', '<(libvpx_source)/vp8/decoder/dboolhuff.h', '<(libvpx_source)/vp8/decoder/decodeframe.c', @@ -216,6 +216,7 @@ '<(libvpx_source)/vp9/common/vp9_thread_common.h', '<(libvpx_source)/vp9/common/vp9_tile_common.c', '<(libvpx_source)/vp9/common/vp9_tile_common.h', + '<(libvpx_source)/vp9/common/x86/convolve.h', '<(libvpx_source)/vp9/common/x86/vp9_asm_stubs.c', '<(libvpx_source)/vp9/common/x86/vp9_copy_sse2.asm', '<(libvpx_source)/vp9/common/x86/vp9_idct_intrin_sse2.h', @@ -318,8 +319,11 @@ '<(libvpx_source)/vp9/encoder/vp9_write_bit_buffer.h', '<(libvpx_source)/vp9/encoder/vp9_writer.c', '<(libvpx_source)/vp9/encoder/vp9_writer.h', + '<(libvpx_source)/vp9/encoder/x86/vp9_dct32x32_avx2_impl.h', + '<(libvpx_source)/vp9/encoder/x86/vp9_dct32x32_sse2_impl.h', '<(libvpx_source)/vp9/encoder/x86/vp9_dct_mmx.asm', '<(libvpx_source)/vp9/encoder/x86/vp9_dct_sse2.h', + '<(libvpx_source)/vp9/encoder/x86/vp9_dct_sse2_impl.h', '<(libvpx_source)/vp9/encoder/x86/vp9_error_sse2.asm', '<(libvpx_source)/vp9/encoder/x86/vp9_subpel_variance.asm', '<(libvpx_source)/vp9/encoder/x86/vp9_subtract_sse2.asm', @@ -344,6 +348,7 @@ '<(libvpx_source)/vpx/vpx_image.h', '<(libvpx_source)/vpx/vpx_integer.h', '<(libvpx_source)/vpx_dsp/sad.c', + '<(libvpx_source)/vpx_dsp/variance.c', '<(libvpx_source)/vpx_dsp/vpx_dsp_rtcd.c', '<(libvpx_source)/vpx_dsp/x86/sad4d_sse2.asm', '<(libvpx_source)/vpx_dsp/x86/sad_mmx.asm', @@ -351,6 +356,7 @@ '<(libvpx_source)/vpx_dsp/x86/sad_sse3.asm', '<(libvpx_source)/vpx_dsp/x86/sad_sse4.asm', '<(libvpx_source)/vpx_dsp/x86/sad_ssse3.asm', + '<(libvpx_source)/vpx_dsp/x86/variance_impl_mmx.asm', '<(libvpx_source)/vpx_mem/include/vpx_mem_intrnl.h', '<(libvpx_source)/vpx_mem/vpx_mem.c', '<(libvpx_source)/vpx_mem/vpx_mem.h', @@ -359,6 +365,7 @@ '<(libvpx_source)/vpx_ports/mem.h', '<(libvpx_source)/vpx_ports/mem_ops.h', '<(libvpx_source)/vpx_ports/mem_ops_aligned.h', + '<(libvpx_source)/vpx_ports/msvc.h', '<(libvpx_source)/vpx_ports/vpx_once.h', '<(libvpx_source)/vpx_ports/vpx_timer.h', '<(libvpx_source)/vpx_ports/x86.h', diff --git a/chromium/third_party/libvpx/libvpx_srcs_x86_64.gypi b/chromium/third_party/libvpx/libvpx_srcs_x86_64.gypi index c99ebee722a..5072d33fa0c 100644 --- 
a/chromium/third_party/libvpx/libvpx_srcs_x86_64.gypi +++ b/chromium/third_party/libvpx/libvpx_srcs_x86_64.gypi @@ -86,10 +86,10 @@ '<(libvpx_source)/vp8/common/x86/subpixel_mmx.asm', '<(libvpx_source)/vp8/common/x86/subpixel_sse2.asm', '<(libvpx_source)/vp8/common/x86/subpixel_ssse3.asm', - '<(libvpx_source)/vp8/common/x86/variance_impl_mmx.asm', '<(libvpx_source)/vp8/common/x86/variance_impl_sse2.asm', '<(libvpx_source)/vp8/common/x86/variance_impl_ssse3.asm', '<(libvpx_source)/vp8/common/x86/vp8_asm_stubs.c', + '<(libvpx_source)/vp8/common/x86/vp8_variance_impl_mmx.asm', '<(libvpx_source)/vp8/decoder/dboolhuff.c', '<(libvpx_source)/vp8/decoder/dboolhuff.h', '<(libvpx_source)/vp8/decoder/decodeframe.c', @@ -218,6 +218,7 @@ '<(libvpx_source)/vp9/common/vp9_thread_common.h', '<(libvpx_source)/vp9/common/vp9_tile_common.c', '<(libvpx_source)/vp9/common/vp9_tile_common.h', + '<(libvpx_source)/vp9/common/x86/convolve.h', '<(libvpx_source)/vp9/common/x86/vp9_asm_stubs.c', '<(libvpx_source)/vp9/common/x86/vp9_copy_sse2.asm', '<(libvpx_source)/vp9/common/x86/vp9_idct_intrin_sse2.h', @@ -321,8 +322,11 @@ '<(libvpx_source)/vp9/encoder/vp9_write_bit_buffer.h', '<(libvpx_source)/vp9/encoder/vp9_writer.c', '<(libvpx_source)/vp9/encoder/vp9_writer.h', + '<(libvpx_source)/vp9/encoder/x86/vp9_dct32x32_avx2_impl.h', + '<(libvpx_source)/vp9/encoder/x86/vp9_dct32x32_sse2_impl.h', '<(libvpx_source)/vp9/encoder/x86/vp9_dct_mmx.asm', '<(libvpx_source)/vp9/encoder/x86/vp9_dct_sse2.h', + '<(libvpx_source)/vp9/encoder/x86/vp9_dct_sse2_impl.h', '<(libvpx_source)/vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm', '<(libvpx_source)/vp9/encoder/x86/vp9_error_sse2.asm', '<(libvpx_source)/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm', @@ -350,6 +354,7 @@ '<(libvpx_source)/vpx/vpx_image.h', '<(libvpx_source)/vpx/vpx_integer.h', '<(libvpx_source)/vpx_dsp/sad.c', + '<(libvpx_source)/vpx_dsp/variance.c', '<(libvpx_source)/vpx_dsp/vpx_dsp_rtcd.c', '<(libvpx_source)/vpx_dsp/x86/sad4d_sse2.asm', '<(libvpx_source)/vpx_dsp/x86/sad_mmx.asm', @@ -357,6 +362,7 @@ '<(libvpx_source)/vpx_dsp/x86/sad_sse3.asm', '<(libvpx_source)/vpx_dsp/x86/sad_sse4.asm', '<(libvpx_source)/vpx_dsp/x86/sad_ssse3.asm', + '<(libvpx_source)/vpx_dsp/x86/variance_impl_mmx.asm', '<(libvpx_source)/vpx_mem/include/vpx_mem_intrnl.h', '<(libvpx_source)/vpx_mem/vpx_mem.c', '<(libvpx_source)/vpx_mem/vpx_mem.h', @@ -365,6 +371,7 @@ '<(libvpx_source)/vpx_ports/mem.h', '<(libvpx_source)/vpx_ports/mem_ops.h', '<(libvpx_source)/vpx_ports/mem_ops_aligned.h', + '<(libvpx_source)/vpx_ports/msvc.h', '<(libvpx_source)/vpx_ports/vpx_once.h', '<(libvpx_source)/vpx_ports/vpx_timer.h', '<(libvpx_source)/vpx_ports/x86.h', diff --git a/chromium/third_party/libvpx/libvpx_srcs_x86_64_intrinsics.gypi b/chromium/third_party/libvpx/libvpx_srcs_x86_64_intrinsics.gypi index ecf9fb7e810..f8590f576ac 100644 --- a/chromium/third_party/libvpx/libvpx_srcs_x86_64_intrinsics.gypi +++ b/chromium/third_party/libvpx/libvpx_srcs_x86_64_intrinsics.gypi @@ -14,8 +14,9 @@ ], 'sources': [ '<(libvpx_source)/vp8/common/x86/idct_blk_mmx.c', - '<(libvpx_source)/vp8/common/x86/variance_mmx.c', + '<(libvpx_source)/vp8/common/x86/vp8_variance_mmx.c', '<(libvpx_source)/vp8/encoder/x86/vp8_enc_stubs_mmx.c', + '<(libvpx_source)/vpx_dsp/x86/variance_mmx.c', ], 'cflags': [ '-mmmx', ], 'xcode_settings': { 'OTHER_CFLAGS': [ '-mmmx' ] }, @@ -30,19 +31,18 @@ 'sources': [ '<(libvpx_source)/vp8/common/x86/idct_blk_sse2.c', '<(libvpx_source)/vp8/common/x86/recon_wrapper_sse2.c', - 
'<(libvpx_source)/vp8/common/x86/variance_sse2.c', + '<(libvpx_source)/vp8/common/x86/vp8_variance_sse2.c', '<(libvpx_source)/vp8/encoder/x86/denoising_sse2.c', '<(libvpx_source)/vp8/encoder/x86/quantize_sse2.c', '<(libvpx_source)/vp8/encoder/x86/vp8_enc_stubs_sse2.c', '<(libvpx_source)/vp9/common/x86/vp9_idct_intrin_sse2.c', '<(libvpx_source)/vp9/common/x86/vp9_loopfilter_intrin_sse2.c', '<(libvpx_source)/vp9/encoder/x86/vp9_avg_intrin_sse2.c', - '<(libvpx_source)/vp9/encoder/x86/vp9_dct32x32_sse2.c', - '<(libvpx_source)/vp9/encoder/x86/vp9_dct_impl_sse2.c', '<(libvpx_source)/vp9/encoder/x86/vp9_dct_sse2.c', '<(libvpx_source)/vp9/encoder/x86/vp9_denoiser_sse2.c', '<(libvpx_source)/vp9/encoder/x86/vp9_quantize_sse2.c', '<(libvpx_source)/vp9/encoder/x86/vp9_variance_sse2.c', + '<(libvpx_source)/vpx_dsp/x86/variance_sse2.c', ], 'cflags': [ '-msse2', ], 'xcode_settings': { 'OTHER_CFLAGS': [ '-msse2' ] }, @@ -104,14 +104,14 @@ 'sources': [ '<(libvpx_source)/vp9/common/x86/vp9_loopfilter_intrin_avx2.c', '<(libvpx_source)/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c', - '<(libvpx_source)/vp9/encoder/x86/vp9_dct32x32_avx2.c', '<(libvpx_source)/vp9/encoder/x86/vp9_dct_avx2.c', '<(libvpx_source)/vp9/encoder/x86/vp9_error_intrin_avx2.c', '<(libvpx_source)/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c', '<(libvpx_source)/vp9/encoder/x86/vp9_variance_avx2.c', - '<(libvpx_source)/vp9/encoder/x86/vp9_variance_impl_intrin_avx2.c', '<(libvpx_source)/vpx_dsp/x86/sad4d_avx2.c', '<(libvpx_source)/vpx_dsp/x86/sad_avx2.c', + '<(libvpx_source)/vpx_dsp/x86/variance_avx2.c', + '<(libvpx_source)/vpx_dsp/x86/variance_impl_avx2.c', ], 'cflags': [ '-mavx2', ], 'xcode_settings': { 'OTHER_CFLAGS': [ '-mavx2' ] }, diff --git a/chromium/third_party/libvpx/libvpx_srcs_x86_intrinsics.gypi b/chromium/third_party/libvpx/libvpx_srcs_x86_intrinsics.gypi index ecf9fb7e810..f8590f576ac 100644 --- a/chromium/third_party/libvpx/libvpx_srcs_x86_intrinsics.gypi +++ b/chromium/third_party/libvpx/libvpx_srcs_x86_intrinsics.gypi @@ -14,8 +14,9 @@ ], 'sources': [ '<(libvpx_source)/vp8/common/x86/idct_blk_mmx.c', - '<(libvpx_source)/vp8/common/x86/variance_mmx.c', + '<(libvpx_source)/vp8/common/x86/vp8_variance_mmx.c', '<(libvpx_source)/vp8/encoder/x86/vp8_enc_stubs_mmx.c', + '<(libvpx_source)/vpx_dsp/x86/variance_mmx.c', ], 'cflags': [ '-mmmx', ], 'xcode_settings': { 'OTHER_CFLAGS': [ '-mmmx' ] }, @@ -30,19 +31,18 @@ 'sources': [ '<(libvpx_source)/vp8/common/x86/idct_blk_sse2.c', '<(libvpx_source)/vp8/common/x86/recon_wrapper_sse2.c', - '<(libvpx_source)/vp8/common/x86/variance_sse2.c', + '<(libvpx_source)/vp8/common/x86/vp8_variance_sse2.c', '<(libvpx_source)/vp8/encoder/x86/denoising_sse2.c', '<(libvpx_source)/vp8/encoder/x86/quantize_sse2.c', '<(libvpx_source)/vp8/encoder/x86/vp8_enc_stubs_sse2.c', '<(libvpx_source)/vp9/common/x86/vp9_idct_intrin_sse2.c', '<(libvpx_source)/vp9/common/x86/vp9_loopfilter_intrin_sse2.c', '<(libvpx_source)/vp9/encoder/x86/vp9_avg_intrin_sse2.c', - '<(libvpx_source)/vp9/encoder/x86/vp9_dct32x32_sse2.c', - '<(libvpx_source)/vp9/encoder/x86/vp9_dct_impl_sse2.c', '<(libvpx_source)/vp9/encoder/x86/vp9_dct_sse2.c', '<(libvpx_source)/vp9/encoder/x86/vp9_denoiser_sse2.c', '<(libvpx_source)/vp9/encoder/x86/vp9_quantize_sse2.c', '<(libvpx_source)/vp9/encoder/x86/vp9_variance_sse2.c', + '<(libvpx_source)/vpx_dsp/x86/variance_sse2.c', ], 'cflags': [ '-msse2', ], 'xcode_settings': { 'OTHER_CFLAGS': [ '-msse2' ] }, @@ -104,14 +104,14 @@ 'sources': [ 
'<(libvpx_source)/vp9/common/x86/vp9_loopfilter_intrin_avx2.c', '<(libvpx_source)/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c', - '<(libvpx_source)/vp9/encoder/x86/vp9_dct32x32_avx2.c', '<(libvpx_source)/vp9/encoder/x86/vp9_dct_avx2.c', '<(libvpx_source)/vp9/encoder/x86/vp9_error_intrin_avx2.c', '<(libvpx_source)/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c', '<(libvpx_source)/vp9/encoder/x86/vp9_variance_avx2.c', - '<(libvpx_source)/vp9/encoder/x86/vp9_variance_impl_intrin_avx2.c', '<(libvpx_source)/vpx_dsp/x86/sad4d_avx2.c', '<(libvpx_source)/vpx_dsp/x86/sad_avx2.c', + '<(libvpx_source)/vpx_dsp/x86/variance_avx2.c', + '<(libvpx_source)/vpx_dsp/x86/variance_impl_avx2.c', ], 'cflags': [ '-mavx2', ], 'xcode_settings': { 'OTHER_CFLAGS': [ '-mavx2' ] }, diff --git a/chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vp8_rtcd.h b/chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vp8_rtcd.h index 21caf9804bc..16cd808528c 100644 --- a/chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vp8_rtcd.h +++ b/chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vp8_rtcd.h @@ -33,8 +33,7 @@ RTCD_EXTERN void (*vp8_bilinear_predict16x16)(unsigned char *src, int src_pitch, void vp8_bilinear_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); void vp8_bilinear_predict4x4_armv6(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_bilinear_predict4x4_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_bilinear_predict4x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +#define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_armv6 void vp8_bilinear_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); void vp8_bilinear_predict8x4_armv6(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); @@ -136,13 +135,6 @@ void vp8_filter_by_weight8x8_c(unsigned char *src, int src_stride, unsigned char int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); #define vp8_full_search_sad vp8_full_search_sad_c -unsigned int vp8_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); -unsigned int vp8_get4x4sse_cs_neon(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); -RTCD_EXTERN unsigned int (*vp8_get4x4sse_cs)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); - -unsigned int vp8_get_mb_ss_c(const short *); -#define vp8_get_mb_ss vp8_get_mb_ss_c - void vp8_intra4x4_predict_c(unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left); void vp8_intra4x4_predict_armv6(unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left); #define vp8_intra4x4_predict vp8_intra4x4_predict_armv6 @@ -199,11 +191,6 @@ void vp8_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,in int vp8_mbuverror_c(struct macroblock *mb); #define vp8_mbuverror vp8_mbuverror_c -unsigned int vp8_mse16x16_c(const unsigned 
char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_mse16x16_armv6(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_mse16x16_neon(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp8_mse16x16)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); - void vp8_plane_add_noise_c(unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch); #define vp8_plane_add_noise vp8_plane_add_noise_c @@ -263,9 +250,6 @@ void vp8_sixtap_predict8x8_armv6(unsigned char *src, int src_pitch, int xofst, i void vp8_sixtap_predict8x8_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); RTCD_EXTERN void (*vp8_sixtap_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -unsigned int vp8_sub_pixel_mse16x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); -#define vp8_sub_pixel_mse16x16 vp8_sub_pixel_mse16x16_c - unsigned int vp8_sub_pixel_variance16x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); unsigned int vp8_sub_pixel_variance16x16_armv6(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); unsigned int vp8_sub_pixel_variance16x16_neon(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); @@ -282,8 +266,7 @@ unsigned int vp8_sub_pixel_variance8x16_c(const unsigned char *src_ptr, int so unsigned int vp8_sub_pixel_variance8x8_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); unsigned int vp8_sub_pixel_variance8x8_armv6(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); -unsigned int vp8_sub_pixel_variance8x8_neon(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp8_sub_pixel_variance8x8)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); +#define vp8_sub_pixel_variance8x8 vp8_sub_pixel_variance8x8_armv6 void vp8_subtract_b_c(struct block *be, struct blockd *bd, int pitch); void vp8_subtract_b_neon(struct block *be, struct blockd *bd, int pitch); @@ -297,27 +280,6 @@ void vp8_subtract_mby_c(short *diff, unsigned char *src, int src_stride, unsigne void vp8_subtract_mby_neon(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride); RTCD_EXTERN void (*vp8_subtract_mby)(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride); -unsigned int vp8_variance16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance16x16_armv6(const unsigned char *src_ptr, int 
source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance16x16_neon(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp8_variance16x16)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp8_variance16x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance16x8_neon(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp8_variance16x8)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp8_variance4x4_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_variance4x4 vp8_variance4x4_c - -unsigned int vp8_variance8x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance8x16_neon(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp8_variance8x16)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp8_variance8x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance8x8_armv6(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance8x8_neon(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp8_variance8x8)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); - unsigned int vp8_variance_halfpixvar16x16_h_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); unsigned int vp8_variance_halfpixvar16x16_h_armv6(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); unsigned int vp8_variance_halfpixvar16x16_h_neon(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); @@ -347,8 +309,6 @@ static void setup_rtcd_internal(void) vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_armv6; if (flags & HAS_NEON) vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_neon; - vp8_bilinear_predict4x4 = vp8_bilinear_predict4x4_armv6; - if (flags & HAS_NEON) vp8_bilinear_predict4x4 = vp8_bilinear_predict4x4_neon; vp8_bilinear_predict8x4 = vp8_bilinear_predict8x4_armv6; if (flags & HAS_NEON) vp8_bilinear_predict8x4 = vp8_bilinear_predict8x4_neon; vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_armv6; @@ -379,8 +339,6 @@ static void setup_rtcd_internal(void) if (flags & HAS_NEON) vp8_dequantize_b = vp8_dequantize_b_neon; vp8_fast_quantize_b = vp8_fast_quantize_b_c; if (flags & HAS_NEON) vp8_fast_quantize_b = vp8_fast_quantize_b_neon; - vp8_get4x4sse_cs = vp8_get4x4sse_cs_c; - if (flags & HAS_NEON) vp8_get4x4sse_cs = vp8_get4x4sse_cs_neon; vp8_loop_filter_bh = vp8_loop_filter_bh_armv6; if (flags & 
HAS_NEON) vp8_loop_filter_bh = vp8_loop_filter_bh_neon; vp8_loop_filter_bv = vp8_loop_filter_bv_armv6; @@ -397,8 +355,6 @@ static void setup_rtcd_internal(void) if (flags & HAS_NEON) vp8_loop_filter_simple_mbh = vp8_loop_filter_mbhs_neon; vp8_loop_filter_simple_mbv = vp8_loop_filter_simple_vertical_edge_armv6; if (flags & HAS_NEON) vp8_loop_filter_simple_mbv = vp8_loop_filter_mbvs_neon; - vp8_mse16x16 = vp8_mse16x16_armv6; - if (flags & HAS_NEON) vp8_mse16x16 = vp8_mse16x16_neon; vp8_short_fdct4x4 = vp8_short_fdct4x4_armv6; if (flags & HAS_NEON) vp8_short_fdct4x4 = vp8_short_fdct4x4_neon; vp8_short_fdct8x4 = vp8_short_fdct8x4_armv6; @@ -417,22 +373,12 @@ static void setup_rtcd_internal(void) if (flags & HAS_NEON) vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_neon; vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_armv6; if (flags & HAS_NEON) vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_neon; - vp8_sub_pixel_variance8x8 = vp8_sub_pixel_variance8x8_armv6; - if (flags & HAS_NEON) vp8_sub_pixel_variance8x8 = vp8_sub_pixel_variance8x8_neon; vp8_subtract_b = vp8_subtract_b_c; if (flags & HAS_NEON) vp8_subtract_b = vp8_subtract_b_neon; vp8_subtract_mbuv = vp8_subtract_mbuv_c; if (flags & HAS_NEON) vp8_subtract_mbuv = vp8_subtract_mbuv_neon; vp8_subtract_mby = vp8_subtract_mby_c; if (flags & HAS_NEON) vp8_subtract_mby = vp8_subtract_mby_neon; - vp8_variance16x16 = vp8_variance16x16_armv6; - if (flags & HAS_NEON) vp8_variance16x16 = vp8_variance16x16_neon; - vp8_variance16x8 = vp8_variance16x8_c; - if (flags & HAS_NEON) vp8_variance16x8 = vp8_variance16x8_neon; - vp8_variance8x16 = vp8_variance8x16_c; - if (flags & HAS_NEON) vp8_variance8x16 = vp8_variance8x16_neon; - vp8_variance8x8 = vp8_variance8x8_armv6; - if (flags & HAS_NEON) vp8_variance8x8 = vp8_variance8x8_neon; vp8_variance_halfpixvar16x16_h = vp8_variance_halfpixvar16x16_h_armv6; if (flags & HAS_NEON) vp8_variance_halfpixvar16x16_h = vp8_variance_halfpixvar16x16_h_neon; vp8_variance_halfpixvar16x16_hv = vp8_variance_halfpixvar16x16_hv_armv6; diff --git a/chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vp9_rtcd.h index eefc3fd7a47..a4108d1f790 100644 --- a/chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vp9_rtcd.h +++ b/chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vp9_rtcd.h @@ -93,7 +93,8 @@ void vp9_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t #define vp9_d135_predictor_32x32 vp9_d135_predictor_32x32_c void vp9_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d135_predictor_4x4 vp9_d135_predictor_4x4_c +void vp9_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_d135_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vp9_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d135_predictor_8x8 vp9_d135_predictor_8x8_c @@ -123,16 +124,19 @@ void vp9_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *a #define vp9_d207_predictor_8x8 vp9_d207_predictor_8x8_c void vp9_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d45_predictor_16x16 vp9_d45_predictor_16x16_c +void vp9_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const 
uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_d45_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vp9_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d45_predictor_32x32 vp9_d45_predictor_32x32_c void vp9_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d45_predictor_4x4 vp9_d45_predictor_4x4_c +void vp9_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_d45_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vp9_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d45_predictor_8x8 vp9_d45_predictor_8x8_c +void vp9_d45_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_d45_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vp9_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d63_predictor_16x16 vp9_d63_predictor_16x16_c @@ -147,52 +151,68 @@ void vp9_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *ab #define vp9_d63_predictor_8x8 vp9_d63_predictor_8x8_c void vp9_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_128_predictor_16x16 vp9_dc_128_predictor_16x16_c +void vp9_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_dc_128_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vp9_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_128_predictor_32x32 vp9_dc_128_predictor_32x32_c +void vp9_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_dc_128_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vp9_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_128_predictor_4x4 vp9_dc_128_predictor_4x4_c +void vp9_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_dc_128_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vp9_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_128_predictor_8x8 vp9_dc_128_predictor_8x8_c +void vp9_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_dc_128_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vp9_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_left_predictor_16x16 vp9_dc_left_predictor_16x16_c +void vp9_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_dc_left_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void 
vp9_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_left_predictor_32x32 vp9_dc_left_predictor_32x32_c +void vp9_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_dc_left_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vp9_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_left_predictor_4x4 vp9_dc_left_predictor_4x4_c +void vp9_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_dc_left_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vp9_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_left_predictor_8x8 vp9_dc_left_predictor_8x8_c +void vp9_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_dc_left_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vp9_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_predictor_16x16 vp9_dc_predictor_16x16_c +void vp9_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_dc_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vp9_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_predictor_32x32 vp9_dc_predictor_32x32_c +void vp9_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_dc_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vp9_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_predictor_4x4 vp9_dc_predictor_4x4_c +void vp9_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_dc_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vp9_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_predictor_8x8 vp9_dc_predictor_8x8_c +void vp9_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_dc_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vp9_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_top_predictor_16x16 vp9_dc_top_predictor_16x16_c +void vp9_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_dc_top_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vp9_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_top_predictor_32x32 vp9_dc_top_predictor_32x32_c +void vp9_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void 
(*vp9_dc_top_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vp9_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_top_predictor_4x4 vp9_dc_top_predictor_4x4_c +void vp9_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_dc_top_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vp9_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_top_predictor_8x8 vp9_dc_top_predictor_8x8_c +void vp9_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_dc_top_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); int vp9_denoiser_filter_c(const uint8_t *sig, int sig_stride, const uint8_t *mc_avg, int mc_avg_stride, uint8_t *avg, int avg_stride, int increase_denoising, BLOCK_SIZE bs, int motion_magnitude); #define vp9_denoiser_filter vp9_denoiser_filter_c @@ -257,17 +277,6 @@ int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, i void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); #define vp9_fwht4x4 vp9_fwht4x4_c -void vp9_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -void vp9_get16x16var_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -RTCD_EXTERN void (*vp9_get16x16var)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); - -void vp9_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -void vp9_get8x8var_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -RTCD_EXTERN void (*vp9_get8x8var)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); - -unsigned int vp9_get_mb_ss_c(const int16_t *); -#define vp9_get_mb_ss vp9_get_mb_ss_c - void vp9_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vp9_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); RTCD_EXTERN void (*vp9_h_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); @@ -410,18 +419,6 @@ void vp9_mbpost_proc_down_c(uint8_t *dst, int pitch, int rows, int cols, int fli void vp9_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); #define vp9_minmax_8x8 vp9_minmax_8x8_c -unsigned int vp9_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -#define vp9_mse16x16 vp9_mse16x16_c - -unsigned int vp9_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -#define vp9_mse16x8 vp9_mse16x8_c - -unsigned int vp9_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -#define vp9_mse8x16 vp9_mse8x16_c - -unsigned int vp9_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -#define 
vp9_mse8x8 vp9_mse8x8_c - void vp9_plane_add_noise_c(uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch); #define vp9_plane_add_noise vp9_plane_add_noise_c @@ -565,51 +562,6 @@ void vp9_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov void vp9_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); RTCD_EXTERN void (*vp9_v_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -unsigned int vp9_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance16x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance16x32 vp9_variance16x32_c - -unsigned int vp9_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance16x8 vp9_variance16x8_c - -unsigned int vp9_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance32x16 vp9_variance32x16_c - -unsigned int vp9_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance32x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance32x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance32x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance32x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance4x4 vp9_variance4x4_c - -unsigned int vp9_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance4x8 vp9_variance4x8_c - -unsigned int vp9_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance64x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance64x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance64x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance64x64)(const uint8_t *src_ptr, 
int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance8x16 vp9_variance8x16_c - -unsigned int vp9_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance8x4 vp9_variance8x4_c - -unsigned int vp9_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance8x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance8x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - int vp9_vector_var_c(int16_t const *ref, int16_t const *src, const int bwl); #define vp9_vector_var vp9_vector_var_c @@ -643,16 +595,52 @@ static void setup_rtcd_internal(void) if (flags & HAS_NEON) vp9_convolve_avg = vp9_convolve_avg_neon; vp9_convolve_copy = vp9_convolve_copy_c; if (flags & HAS_NEON) vp9_convolve_copy = vp9_convolve_copy_neon; + vp9_d135_predictor_4x4 = vp9_d135_predictor_4x4_c; + if (flags & HAS_NEON) vp9_d135_predictor_4x4 = vp9_d135_predictor_4x4_neon; + vp9_d45_predictor_16x16 = vp9_d45_predictor_16x16_c; + if (flags & HAS_NEON) vp9_d45_predictor_16x16 = vp9_d45_predictor_16x16_neon; + vp9_d45_predictor_4x4 = vp9_d45_predictor_4x4_c; + if (flags & HAS_NEON) vp9_d45_predictor_4x4 = vp9_d45_predictor_4x4_neon; + vp9_d45_predictor_8x8 = vp9_d45_predictor_8x8_c; + if (flags & HAS_NEON) vp9_d45_predictor_8x8 = vp9_d45_predictor_8x8_neon; + vp9_dc_128_predictor_16x16 = vp9_dc_128_predictor_16x16_c; + if (flags & HAS_NEON) vp9_dc_128_predictor_16x16 = vp9_dc_128_predictor_16x16_neon; + vp9_dc_128_predictor_32x32 = vp9_dc_128_predictor_32x32_c; + if (flags & HAS_NEON) vp9_dc_128_predictor_32x32 = vp9_dc_128_predictor_32x32_neon; + vp9_dc_128_predictor_4x4 = vp9_dc_128_predictor_4x4_c; + if (flags & HAS_NEON) vp9_dc_128_predictor_4x4 = vp9_dc_128_predictor_4x4_neon; + vp9_dc_128_predictor_8x8 = vp9_dc_128_predictor_8x8_c; + if (flags & HAS_NEON) vp9_dc_128_predictor_8x8 = vp9_dc_128_predictor_8x8_neon; + vp9_dc_left_predictor_16x16 = vp9_dc_left_predictor_16x16_c; + if (flags & HAS_NEON) vp9_dc_left_predictor_16x16 = vp9_dc_left_predictor_16x16_neon; + vp9_dc_left_predictor_32x32 = vp9_dc_left_predictor_32x32_c; + if (flags & HAS_NEON) vp9_dc_left_predictor_32x32 = vp9_dc_left_predictor_32x32_neon; + vp9_dc_left_predictor_4x4 = vp9_dc_left_predictor_4x4_c; + if (flags & HAS_NEON) vp9_dc_left_predictor_4x4 = vp9_dc_left_predictor_4x4_neon; + vp9_dc_left_predictor_8x8 = vp9_dc_left_predictor_8x8_c; + if (flags & HAS_NEON) vp9_dc_left_predictor_8x8 = vp9_dc_left_predictor_8x8_neon; + vp9_dc_predictor_16x16 = vp9_dc_predictor_16x16_c; + if (flags & HAS_NEON) vp9_dc_predictor_16x16 = vp9_dc_predictor_16x16_neon; + vp9_dc_predictor_32x32 = vp9_dc_predictor_32x32_c; + if (flags & HAS_NEON) vp9_dc_predictor_32x32 = vp9_dc_predictor_32x32_neon; + vp9_dc_predictor_4x4 = vp9_dc_predictor_4x4_c; + if (flags & HAS_NEON) vp9_dc_predictor_4x4 = vp9_dc_predictor_4x4_neon; + vp9_dc_predictor_8x8 = vp9_dc_predictor_8x8_c; + if (flags & HAS_NEON) vp9_dc_predictor_8x8 = vp9_dc_predictor_8x8_neon; + vp9_dc_top_predictor_16x16 = vp9_dc_top_predictor_16x16_c; + if (flags & HAS_NEON) vp9_dc_top_predictor_16x16 = vp9_dc_top_predictor_16x16_neon; + 
vp9_dc_top_predictor_32x32 = vp9_dc_top_predictor_32x32_c; + if (flags & HAS_NEON) vp9_dc_top_predictor_32x32 = vp9_dc_top_predictor_32x32_neon; + vp9_dc_top_predictor_4x4 = vp9_dc_top_predictor_4x4_c; + if (flags & HAS_NEON) vp9_dc_top_predictor_4x4 = vp9_dc_top_predictor_4x4_neon; + vp9_dc_top_predictor_8x8 = vp9_dc_top_predictor_8x8_c; + if (flags & HAS_NEON) vp9_dc_top_predictor_8x8 = vp9_dc_top_predictor_8x8_neon; vp9_fdct8x8 = vp9_fdct8x8_c; if (flags & HAS_NEON) vp9_fdct8x8 = vp9_fdct8x8_neon; vp9_fdct8x8_1 = vp9_fdct8x8_1_c; if (flags & HAS_NEON) vp9_fdct8x8_1 = vp9_fdct8x8_1_neon; vp9_fdct8x8_quant = vp9_fdct8x8_quant_c; if (flags & HAS_NEON) vp9_fdct8x8_quant = vp9_fdct8x8_quant_neon; - vp9_get16x16var = vp9_get16x16var_c; - if (flags & HAS_NEON) vp9_get16x16var = vp9_get16x16var_neon; - vp9_get8x8var = vp9_get8x8var_c; - if (flags & HAS_NEON) vp9_get8x8var = vp9_get8x8var_neon; vp9_h_predictor_16x16 = vp9_h_predictor_16x16_c; if (flags & HAS_NEON) vp9_h_predictor_16x16 = vp9_h_predictor_16x16_neon; vp9_h_predictor_32x32 = vp9_h_predictor_32x32_c; @@ -737,18 +725,6 @@ static void setup_rtcd_internal(void) if (flags & HAS_NEON) vp9_v_predictor_4x4 = vp9_v_predictor_4x4_neon; vp9_v_predictor_8x8 = vp9_v_predictor_8x8_c; if (flags & HAS_NEON) vp9_v_predictor_8x8 = vp9_v_predictor_8x8_neon; - vp9_variance16x16 = vp9_variance16x16_c; - if (flags & HAS_NEON) vp9_variance16x16 = vp9_variance16x16_neon; - vp9_variance32x32 = vp9_variance32x32_c; - if (flags & HAS_NEON) vp9_variance32x32 = vp9_variance32x32_neon; - vp9_variance32x64 = vp9_variance32x64_c; - if (flags & HAS_NEON) vp9_variance32x64 = vp9_variance32x64_neon; - vp9_variance64x32 = vp9_variance64x32_c; - if (flags & HAS_NEON) vp9_variance64x32 = vp9_variance64x32_neon; - vp9_variance64x64 = vp9_variance64x64_c; - if (flags & HAS_NEON) vp9_variance64x64 = vp9_variance64x64_neon; - vp9_variance8x8 = vp9_variance8x8_c; - if (flags & HAS_NEON) vp9_variance8x8 = vp9_variance8x8_neon; } #endif diff --git a/chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vpx_config.asm b/chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vpx_config.asm index 10c565ed61b..2fcb7d25b85 100644 --- a/chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vpx_config.asm +++ b/chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vpx_config.asm @@ -23,7 +23,6 @@ .equ HAVE_AVX2 , 0 .equ HAVE_VPX_PORTS , 1 .equ HAVE_STDINT_H , 1 -.equ HAVE_ALT_TREE_LAYOUT , 0 .equ HAVE_PTHREAD_H , 1 .equ HAVE_SYS_MMAN_H , 1 .equ HAVE_UNISTD_H , 0 diff --git a/chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vpx_config.c b/chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vpx_config.c index 9b5a0ea3583..721ab0ae421 100644 --- a/chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vpx_config.c +++ b/chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vpx_config.c @@ -5,5 +5,6 @@ /* tree. An additional intellectual property rights grant can be found */ /* in the file PATENTS. All contributing project authors may */ /* be found in the AUTHORS file in the root of the source tree. 
*/ +#include "vpx/vpx_codec.h" static const char* const cfg = "--target=armv7-linux-gcc --enable-pic --enable-realtime-only --enable-runtime-cpu-detect --disable-edsp --enable-external-build --enable-postproc --disable-install-srcs --enable-multi-res-encoding --enable-temporal-denoising --disable-unit-tests --disable-install-docs --disable-examples --enable-vp9-temporal-denoising --enable-vp9-postproc --size-limit=16384x16384"; const char *vpx_codec_build_config(void) {return cfg;} diff --git a/chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vpx_config.h b/chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vpx_config.h index 3fb03579215..a1a3dcd04a7 100644 --- a/chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vpx_config.h +++ b/chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vpx_config.h @@ -32,7 +32,6 @@ #define HAVE_AVX2 0 #define HAVE_VPX_PORTS 1 #define HAVE_STDINT_H 1 -#define HAVE_ALT_TREE_LAYOUT 0 #define HAVE_PTHREAD_H 1 #define HAVE_SYS_MMAN_H 1 #define HAVE_UNISTD_H 0 diff --git a/chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vpx_dsp_rtcd.h index 5d3e9df812b..f926cf29a59 100644 --- a/chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vpx_dsp_rtcd.h +++ b/chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vpx_dsp_rtcd.h @@ -18,6 +18,38 @@ extern "C" { #endif +void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +#define vpx_comp_avg_pred vpx_comp_avg_pred_c + +void vpx_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get16x16var_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +RTCD_EXTERN void (*vpx_get16x16var)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); + +unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); +unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_get4x4sse_cs)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); + +void vpx_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get8x8var_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +RTCD_EXTERN void (*vpx_get8x8var)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); + +unsigned int vpx_get_mb_ss_c(const int16_t *); +#define vpx_get_mb_ss vpx_get_mb_ss_c + +unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse16x16_media(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse16x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int 
recon_stride, unsigned int *sse); + +unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_mse16x8 vpx_mse16x8_c + +unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_mse8x16 vpx_mse8x16_c + +unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_mse8x8 vpx_mse8x8_c + unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad16x16_media(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -194,6 +226,55 @@ void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * con void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); #define vpx_sad8x8x8 vpx_sad8x8x8_c +unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_media(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance16x32 vpx_variance16x32_c + +unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance32x16 vpx_variance32x16_c + +unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance4x4 vpx_variance4x4_c + +unsigned 
int vpx_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance4x8 vpx_variance4x8_c + +unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance64x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance64x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance8x4 vpx_variance8x4_c + +unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x8_media(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + void vpx_dsp_rtcd(void); #include "vpx_config.h" @@ -206,6 +287,14 @@ static void setup_rtcd_internal(void) (void)flags; + vpx_get16x16var = vpx_get16x16var_c; + if (flags & HAS_NEON) vpx_get16x16var = vpx_get16x16var_neon; + vpx_get4x4sse_cs = vpx_get4x4sse_cs_c; + if (flags & HAS_NEON) vpx_get4x4sse_cs = vpx_get4x4sse_cs_neon; + vpx_get8x8var = vpx_get8x8var_c; + if (flags & HAS_NEON) vpx_get8x8var = vpx_get8x8var_neon; + vpx_mse16x16 = vpx_mse16x16_media; + if (flags & HAS_NEON) vpx_mse16x16 = vpx_mse16x16_neon; vpx_sad16x16 = vpx_sad16x16_media; if (flags & HAS_NEON) vpx_sad16x16 = vpx_sad16x16_neon; vpx_sad16x16x4d = vpx_sad16x16x4d_c; @@ -226,6 +315,22 @@ static void setup_rtcd_internal(void) if (flags & HAS_NEON) vpx_sad8x16 = vpx_sad8x16_neon; vpx_sad8x8 = vpx_sad8x8_c; if (flags & HAS_NEON) vpx_sad8x8 = vpx_sad8x8_neon; + vpx_variance16x16 = vpx_variance16x16_media; + if (flags & HAS_NEON) vpx_variance16x16 = vpx_variance16x16_neon; + vpx_variance16x8 = vpx_variance16x8_c; + if (flags & HAS_NEON) vpx_variance16x8 = vpx_variance16x8_neon; + vpx_variance32x32 = vpx_variance32x32_c; + if (flags & HAS_NEON) vpx_variance32x32 = vpx_variance32x32_neon; + vpx_variance32x64 = vpx_variance32x64_c; + if (flags & HAS_NEON) vpx_variance32x64 = vpx_variance32x64_neon; + vpx_variance64x32 = vpx_variance64x32_c; + if (flags & HAS_NEON) 
vpx_variance64x32 = vpx_variance64x32_neon; + vpx_variance64x64 = vpx_variance64x64_c; + if (flags & HAS_NEON) vpx_variance64x64 = vpx_variance64x64_neon; + vpx_variance8x16 = vpx_variance8x16_c; + if (flags & HAS_NEON) vpx_variance8x16 = vpx_variance8x16_neon; + vpx_variance8x8 = vpx_variance8x8_media; + if (flags & HAS_NEON) vpx_variance8x8 = vpx_variance8x8_neon; } #endif diff --git a/chromium/third_party/libvpx/source/config/linux/arm-neon/vp8_rtcd.h b/chromium/third_party/libvpx/source/config/linux/arm-neon/vp8_rtcd.h index 8710a5784d1..a7a593501ad 100644 --- a/chromium/third_party/libvpx/source/config/linux/arm-neon/vp8_rtcd.h +++ b/chromium/third_party/libvpx/source/config/linux/arm-neon/vp8_rtcd.h @@ -33,8 +33,7 @@ void vp8_bilinear_predict16x16_neon(unsigned char *src, int src_pitch, int xofst void vp8_bilinear_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); void vp8_bilinear_predict4x4_armv6(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_bilinear_predict4x4_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -#define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_neon +#define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_armv6 void vp8_bilinear_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); void vp8_bilinear_predict8x4_armv6(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); @@ -136,13 +135,6 @@ void vp8_filter_by_weight8x8_c(unsigned char *src, int src_stride, unsigned char int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); #define vp8_full_search_sad vp8_full_search_sad_c -unsigned int vp8_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); -unsigned int vp8_get4x4sse_cs_neon(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); -#define vp8_get4x4sse_cs vp8_get4x4sse_cs_neon - -unsigned int vp8_get_mb_ss_c(const short *); -#define vp8_get_mb_ss vp8_get_mb_ss_c - void vp8_intra4x4_predict_c(unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left); void vp8_intra4x4_predict_armv6(unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left); #define vp8_intra4x4_predict vp8_intra4x4_predict_armv6 @@ -199,11 +191,6 @@ void vp8_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,in int vp8_mbuverror_c(struct macroblock *mb); #define vp8_mbuverror vp8_mbuverror_c -unsigned int vp8_mse16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_mse16x16_armv6(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_mse16x16_neon(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_mse16x16 vp8_mse16x16_neon - void vp8_plane_add_noise_c(unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch); #define 
vp8_plane_add_noise vp8_plane_add_noise_c @@ -263,9 +250,6 @@ void vp8_sixtap_predict8x8_armv6(unsigned char *src, int src_pitch, int xofst, i void vp8_sixtap_predict8x8_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); #define vp8_sixtap_predict8x8 vp8_sixtap_predict8x8_neon -unsigned int vp8_sub_pixel_mse16x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); -#define vp8_sub_pixel_mse16x16 vp8_sub_pixel_mse16x16_c - unsigned int vp8_sub_pixel_variance16x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); unsigned int vp8_sub_pixel_variance16x16_armv6(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); unsigned int vp8_sub_pixel_variance16x16_neon(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); @@ -282,8 +266,7 @@ unsigned int vp8_sub_pixel_variance8x16_c(const unsigned char *src_ptr, int so unsigned int vp8_sub_pixel_variance8x8_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); unsigned int vp8_sub_pixel_variance8x8_armv6(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); -unsigned int vp8_sub_pixel_variance8x8_neon(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); -#define vp8_sub_pixel_variance8x8 vp8_sub_pixel_variance8x8_neon +#define vp8_sub_pixel_variance8x8 vp8_sub_pixel_variance8x8_armv6 void vp8_subtract_b_c(struct block *be, struct blockd *bd, int pitch); void vp8_subtract_b_neon(struct block *be, struct blockd *bd, int pitch); @@ -297,27 +280,6 @@ void vp8_subtract_mby_c(short *diff, unsigned char *src, int src_stride, unsigne void vp8_subtract_mby_neon(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride); #define vp8_subtract_mby vp8_subtract_mby_neon -unsigned int vp8_variance16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance16x16_armv6(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance16x16_neon(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_variance16x16 vp8_variance16x16_neon - -unsigned int vp8_variance16x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance16x8_neon(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_variance16x8 vp8_variance16x8_neon - -unsigned int vp8_variance4x4_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_variance4x4 vp8_variance4x4_c - -unsigned int vp8_variance8x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance8x16_neon(const 
unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_variance8x16 vp8_variance8x16_neon - -unsigned int vp8_variance8x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance8x8_armv6(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance8x8_neon(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_variance8x8 vp8_variance8x8_neon - unsigned int vp8_variance_halfpixvar16x16_h_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); unsigned int vp8_variance_halfpixvar16x16_h_armv6(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); unsigned int vp8_variance_halfpixvar16x16_h_neon(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); diff --git a/chromium/third_party/libvpx/source/config/linux/arm-neon/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/linux/arm-neon/vp9_rtcd.h index 9e445d5476f..8cc98e853c4 100644 --- a/chromium/third_party/libvpx/source/config/linux/arm-neon/vp9_rtcd.h +++ b/chromium/third_party/libvpx/source/config/linux/arm-neon/vp9_rtcd.h @@ -93,7 +93,8 @@ void vp9_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t #define vp9_d135_predictor_32x32 vp9_d135_predictor_32x32_c void vp9_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d135_predictor_4x4 vp9_d135_predictor_4x4_c +void vp9_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d135_predictor_4x4 vp9_d135_predictor_4x4_neon void vp9_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d135_predictor_8x8 vp9_d135_predictor_8x8_c @@ -123,16 +124,19 @@ void vp9_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *a #define vp9_d207_predictor_8x8 vp9_d207_predictor_8x8_c void vp9_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d45_predictor_16x16 vp9_d45_predictor_16x16_c +void vp9_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d45_predictor_16x16 vp9_d45_predictor_16x16_neon void vp9_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d45_predictor_32x32 vp9_d45_predictor_32x32_c void vp9_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d45_predictor_4x4 vp9_d45_predictor_4x4_c +void vp9_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d45_predictor_4x4 vp9_d45_predictor_4x4_neon void vp9_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d45_predictor_8x8 vp9_d45_predictor_8x8_c +void vp9_d45_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d45_predictor_8x8 vp9_d45_predictor_8x8_neon void vp9_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t 
*above, const uint8_t *left); #define vp9_d63_predictor_16x16 vp9_d63_predictor_16x16_c @@ -147,52 +151,68 @@ void vp9_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *ab #define vp9_d63_predictor_8x8 vp9_d63_predictor_8x8_c void vp9_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_128_predictor_16x16 vp9_dc_128_predictor_16x16_c +void vp9_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_128_predictor_16x16 vp9_dc_128_predictor_16x16_neon void vp9_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_128_predictor_32x32 vp9_dc_128_predictor_32x32_c +void vp9_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_128_predictor_32x32 vp9_dc_128_predictor_32x32_neon void vp9_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_128_predictor_4x4 vp9_dc_128_predictor_4x4_c +void vp9_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_128_predictor_4x4 vp9_dc_128_predictor_4x4_neon void vp9_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_128_predictor_8x8 vp9_dc_128_predictor_8x8_c +void vp9_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_128_predictor_8x8 vp9_dc_128_predictor_8x8_neon void vp9_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_left_predictor_16x16 vp9_dc_left_predictor_16x16_c +void vp9_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_left_predictor_16x16 vp9_dc_left_predictor_16x16_neon void vp9_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_left_predictor_32x32 vp9_dc_left_predictor_32x32_c +void vp9_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_left_predictor_32x32 vp9_dc_left_predictor_32x32_neon void vp9_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_left_predictor_4x4 vp9_dc_left_predictor_4x4_c +void vp9_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_left_predictor_4x4 vp9_dc_left_predictor_4x4_neon void vp9_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_left_predictor_8x8 vp9_dc_left_predictor_8x8_c +void vp9_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_left_predictor_8x8 vp9_dc_left_predictor_8x8_neon void vp9_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_predictor_16x16 vp9_dc_predictor_16x16_c +void vp9_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_predictor_16x16 vp9_dc_predictor_16x16_neon void vp9_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); 
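
Aside (editorial note, not part of the diff): most of the rtcd.h hunks above toggle between libvpx's two RTCD dispatch modes. In the static arm-neon configs a generic symbol is bound at build time with a plain `#define name name_neon`, while in the arm-neon-cpu-detect configs the same symbol becomes an `RTCD_EXTERN` function pointer that `setup_rtcd_internal()` repoints once the runtime CPU flags (`HAS_NEON`, etc.) are known. The sketch below is a minimal, self-contained illustration of that pattern only; it is not libvpx source, and every name in it (`my_predictor*`, `MY_HAS_NEON`, `MY_STATIC_NEON_BUILD`) is hypothetical.

```c
/* Illustrative sketch only -- not libvpx code. It mirrors the two RTCD
 * dispatch styles seen in the generated headers: a static config binds the
 * generic name with a #define, a cpu-detect config uses a function pointer
 * that a setup routine repoints after reading the CPU feature flags. */
#include <stdint.h>
#include <stdio.h>

static void my_predictor_c(uint8_t *dst)    { dst[0] = 1; }  /* portable C fallback */
static void my_predictor_neon(uint8_t *dst) { dst[0] = 2; }  /* stand-in for a NEON variant */

#define MY_HAS_NEON 0x1  /* hypothetical feature flag */

#ifdef MY_STATIC_NEON_BUILD
/* Static config (cf. linux/arm-neon): resolved at compile time, no indirection. */
#define my_predictor my_predictor_neon
#else
/* CPU-detect config (cf. linux/arm-neon-cpu-detect): resolved at runtime,
 * analogous to the RTCD_EXTERN pointer plus setup_rtcd_internal(). */
static void (*my_predictor)(uint8_t *dst) = my_predictor_c;

static void setup_dispatch(int cpu_flags)
{
    my_predictor = my_predictor_c;                        /* safe default */
    if (cpu_flags & MY_HAS_NEON) my_predictor = my_predictor_neon;
}
#endif

int main(void)
{
    uint8_t pixel = 0;
#ifndef MY_STATIC_NEON_BUILD
    setup_dispatch(MY_HAS_NEON);   /* pretend the CPU reported NEON support */
#endif
    my_predictor(&pixel);
    printf("dispatched variant wrote %u\n", pixel);
    return 0;
}
```

The same trade-off explains why this update deletes so many per-codec `vp8_variance*`/`vp9_variance*` entries here: those kernels moved into the shared `vpx_dsp` RTCD tables, so only the dispatch glue changes in these generated headers.
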
-#define vp9_dc_predictor_32x32 vp9_dc_predictor_32x32_c +void vp9_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_predictor_32x32 vp9_dc_predictor_32x32_neon void vp9_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_predictor_4x4 vp9_dc_predictor_4x4_c +void vp9_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_predictor_4x4 vp9_dc_predictor_4x4_neon void vp9_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_predictor_8x8 vp9_dc_predictor_8x8_c +void vp9_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_predictor_8x8 vp9_dc_predictor_8x8_neon void vp9_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_top_predictor_16x16 vp9_dc_top_predictor_16x16_c +void vp9_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_top_predictor_16x16 vp9_dc_top_predictor_16x16_neon void vp9_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_top_predictor_32x32 vp9_dc_top_predictor_32x32_c +void vp9_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_top_predictor_32x32 vp9_dc_top_predictor_32x32_neon void vp9_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_top_predictor_4x4 vp9_dc_top_predictor_4x4_c +void vp9_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_top_predictor_4x4 vp9_dc_top_predictor_4x4_neon void vp9_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_top_predictor_8x8 vp9_dc_top_predictor_8x8_c +void vp9_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_top_predictor_8x8 vp9_dc_top_predictor_8x8_neon int vp9_denoiser_filter_c(const uint8_t *sig, int sig_stride, const uint8_t *mc_avg, int mc_avg_stride, uint8_t *avg, int avg_stride, int increase_denoising, BLOCK_SIZE bs, int motion_magnitude); #define vp9_denoiser_filter vp9_denoiser_filter_c @@ -257,17 +277,6 @@ int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, i void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); #define vp9_fwht4x4 vp9_fwht4x4_c -void vp9_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -void vp9_get16x16var_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -#define vp9_get16x16var vp9_get16x16var_neon - -void vp9_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -void vp9_get8x8var_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -#define vp9_get8x8var vp9_get8x8var_neon - -unsigned int vp9_get_mb_ss_c(const int16_t *); -#define vp9_get_mb_ss vp9_get_mb_ss_c - void vp9_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const 
uint8_t *above, const uint8_t *left); void vp9_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_h_predictor_16x16 vp9_h_predictor_16x16_neon @@ -410,18 +419,6 @@ void vp9_mbpost_proc_down_c(uint8_t *dst, int pitch, int rows, int cols, int fli void vp9_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); #define vp9_minmax_8x8 vp9_minmax_8x8_c -unsigned int vp9_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -#define vp9_mse16x16 vp9_mse16x16_c - -unsigned int vp9_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -#define vp9_mse16x8 vp9_mse16x8_c - -unsigned int vp9_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -#define vp9_mse8x16 vp9_mse8x16_c - -unsigned int vp9_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -#define vp9_mse8x8 vp9_mse8x8_c - void vp9_plane_add_noise_c(uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch); #define vp9_plane_add_noise vp9_plane_add_noise_c @@ -565,51 +562,6 @@ void vp9_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov void vp9_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_v_predictor_8x8 vp9_v_predictor_8x8_neon -unsigned int vp9_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance16x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance16x16 vp9_variance16x16_neon - -unsigned int vp9_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance16x32 vp9_variance16x32_c - -unsigned int vp9_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance16x8 vp9_variance16x8_c - -unsigned int vp9_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance32x16 vp9_variance32x16_c - -unsigned int vp9_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance32x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance32x32 vp9_variance32x32_neon - -unsigned int vp9_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance32x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance32x64 vp9_variance32x64_neon - -unsigned int vp9_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance4x4 vp9_variance4x4_c - -unsigned int vp9_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance4x8 vp9_variance4x8_c - -unsigned int vp9_variance64x32_c(const uint8_t 
*src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance64x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance64x32 vp9_variance64x32_neon - -unsigned int vp9_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance64x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance64x64 vp9_variance64x64_neon - -unsigned int vp9_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance8x16 vp9_variance8x16_c - -unsigned int vp9_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance8x4 vp9_variance8x4_c - -unsigned int vp9_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance8x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance8x8 vp9_variance8x8_neon - int vp9_vector_var_c(int16_t const *ref, int16_t const *src, const int bwl); #define vp9_vector_var vp9_vector_var_c diff --git a/chromium/third_party/libvpx/source/config/linux/arm-neon/vpx_config.asm b/chromium/third_party/libvpx/source/config/linux/arm-neon/vpx_config.asm index fb523269888..b1043e656d9 100644 --- a/chromium/third_party/libvpx/source/config/linux/arm-neon/vpx_config.asm +++ b/chromium/third_party/libvpx/source/config/linux/arm-neon/vpx_config.asm @@ -23,7 +23,6 @@ .equ HAVE_AVX2 , 0 .equ HAVE_VPX_PORTS , 1 .equ HAVE_STDINT_H , 1 -.equ HAVE_ALT_TREE_LAYOUT , 0 .equ HAVE_PTHREAD_H , 1 .equ HAVE_SYS_MMAN_H , 1 .equ HAVE_UNISTD_H , 0 diff --git a/chromium/third_party/libvpx/source/config/linux/arm-neon/vpx_config.c b/chromium/third_party/libvpx/source/config/linux/arm-neon/vpx_config.c index 58a32f0238e..e05d6991036 100644 --- a/chromium/third_party/libvpx/source/config/linux/arm-neon/vpx_config.c +++ b/chromium/third_party/libvpx/source/config/linux/arm-neon/vpx_config.c @@ -5,5 +5,6 @@ /* tree. An additional intellectual property rights grant can be found */ /* in the file PATENTS. All contributing project authors may */ /* be found in the AUTHORS file in the root of the source tree. 
*/ +#include "vpx/vpx_codec.h" static const char* const cfg = "--target=armv7-linux-gcc --enable-pic --enable-realtime-only --disable-edsp --enable-external-build --enable-postproc --disable-install-srcs --enable-multi-res-encoding --enable-temporal-denoising --disable-unit-tests --disable-install-docs --disable-examples --enable-vp9-temporal-denoising --enable-vp9-postproc --size-limit=16384x16384"; const char *vpx_codec_build_config(void) {return cfg;} diff --git a/chromium/third_party/libvpx/source/config/linux/arm-neon/vpx_config.h b/chromium/third_party/libvpx/source/config/linux/arm-neon/vpx_config.h index 477014ee9c6..4aaf47ba413 100644 --- a/chromium/third_party/libvpx/source/config/linux/arm-neon/vpx_config.h +++ b/chromium/third_party/libvpx/source/config/linux/arm-neon/vpx_config.h @@ -32,7 +32,6 @@ #define HAVE_AVX2 0 #define HAVE_VPX_PORTS 1 #define HAVE_STDINT_H 1 -#define HAVE_ALT_TREE_LAYOUT 0 #define HAVE_PTHREAD_H 1 #define HAVE_SYS_MMAN_H 1 #define HAVE_UNISTD_H 0 diff --git a/chromium/third_party/libvpx/source/config/linux/arm-neon/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/linux/arm-neon/vpx_dsp_rtcd.h index 955c7640f79..82aa34db379 100644 --- a/chromium/third_party/libvpx/source/config/linux/arm-neon/vpx_dsp_rtcd.h +++ b/chromium/third_party/libvpx/source/config/linux/arm-neon/vpx_dsp_rtcd.h @@ -18,6 +18,38 @@ extern "C" { #endif +void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +#define vpx_comp_avg_pred vpx_comp_avg_pred_c + +void vpx_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get16x16var_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_get16x16var vpx_get16x16var_neon + +unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); +unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); +#define vpx_get4x4sse_cs vpx_get4x4sse_cs_neon + +void vpx_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get8x8var_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_get8x8var vpx_get8x8var_neon + +unsigned int vpx_get_mb_ss_c(const int16_t *); +#define vpx_get_mb_ss vpx_get_mb_ss_c + +unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse16x16_media(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse16x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_mse16x16 vpx_mse16x16_neon + +unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_mse16x8 vpx_mse16x8_c + +unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_mse8x16 vpx_mse8x16_c + +unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_mse8x8 
vpx_mse8x8_c + unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad16x16_media(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -194,6 +226,55 @@ void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * con void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); #define vpx_sad8x8x8 vpx_sad8x8x8_c +unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_media(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance16x16 vpx_variance16x16_neon + +unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance16x32 vpx_variance16x32_c + +unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance16x8 vpx_variance16x8_neon + +unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance32x16 vpx_variance32x16_c + +unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance32x32 vpx_variance32x32_neon + +unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance32x64 vpx_variance32x64_neon + +unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance4x4 vpx_variance4x4_c + +unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance4x8 vpx_variance4x8_c + +unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance64x32 vpx_variance64x32_neon + +unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance64x64 vpx_variance64x64_neon + +unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, 
int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance8x16 vpx_variance8x16_neon + +unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance8x4 vpx_variance8x4_c + +unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x8_media(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance8x8 vpx_variance8x8_neon + void vpx_dsp_rtcd(void); #include "vpx_config.h" diff --git a/chromium/third_party/libvpx/source/config/linux/arm/vp8_rtcd.h b/chromium/third_party/libvpx/source/config/linux/arm/vp8_rtcd.h index 37e4a3529d3..6d28eb60d2f 100644 --- a/chromium/third_party/libvpx/source/config/linux/arm/vp8_rtcd.h +++ b/chromium/third_party/libvpx/source/config/linux/arm/vp8_rtcd.h @@ -119,12 +119,6 @@ void vp8_filter_by_weight8x8_c(unsigned char *src, int src_stride, unsigned char int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); #define vp8_full_search_sad vp8_full_search_sad_c -unsigned int vp8_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); -#define vp8_get4x4sse_cs vp8_get4x4sse_cs_c - -unsigned int vp8_get_mb_ss_c(const short *); -#define vp8_get_mb_ss vp8_get_mb_ss_c - void vp8_intra4x4_predict_c(unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left); void vp8_intra4x4_predict_armv6(unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left); #define vp8_intra4x4_predict vp8_intra4x4_predict_armv6 @@ -173,10 +167,6 @@ void vp8_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,in int vp8_mbuverror_c(struct macroblock *mb); #define vp8_mbuverror vp8_mbuverror_c -unsigned int vp8_mse16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_mse16x16_armv6(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_mse16x16 vp8_mse16x16_armv6 - void vp8_plane_add_noise_c(unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch); #define vp8_plane_add_noise vp8_plane_add_noise_c @@ -228,9 +218,6 @@ void vp8_sixtap_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int y void vp8_sixtap_predict8x8_armv6(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); #define vp8_sixtap_predict8x8 vp8_sixtap_predict8x8_armv6 -unsigned int vp8_sub_pixel_mse16x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); -#define vp8_sub_pixel_mse16x16 vp8_sub_pixel_mse16x16_c - unsigned int vp8_sub_pixel_variance16x16_c(const unsigned 
char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); unsigned int vp8_sub_pixel_variance16x16_armv6(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); #define vp8_sub_pixel_variance16x16 vp8_sub_pixel_variance16x16_armv6 @@ -257,23 +244,6 @@ void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, void vp8_subtract_mby_c(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride); #define vp8_subtract_mby vp8_subtract_mby_c -unsigned int vp8_variance16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance16x16_armv6(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_variance16x16 vp8_variance16x16_armv6 - -unsigned int vp8_variance16x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_variance16x8 vp8_variance16x8_c - -unsigned int vp8_variance4x4_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_variance4x4 vp8_variance4x4_c - -unsigned int vp8_variance8x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_variance8x16 vp8_variance8x16_c - -unsigned int vp8_variance8x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance8x8_armv6(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_variance8x8 vp8_variance8x8_armv6 - unsigned int vp8_variance_halfpixvar16x16_h_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); unsigned int vp8_variance_halfpixvar16x16_h_armv6(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); #define vp8_variance_halfpixvar16x16_h vp8_variance_halfpixvar16x16_h_armv6 diff --git a/chromium/third_party/libvpx/source/config/linux/arm/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/linux/arm/vp9_rtcd.h index fcca3063998..d1974a6f368 100644 --- a/chromium/third_party/libvpx/source/config/linux/arm/vp9_rtcd.h +++ b/chromium/third_party/libvpx/source/config/linux/arm/vp9_rtcd.h @@ -245,15 +245,6 @@ int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, i void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); #define vp9_fwht4x4 vp9_fwht4x4_c -void vp9_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -#define vp9_get16x16var vp9_get16x16var_c - -void vp9_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -#define vp9_get8x8var vp9_get8x8var_c - -unsigned int vp9_get_mb_ss_c(const int16_t *); -#define vp9_get_mb_ss vp9_get_mb_ss_c - void vp9_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_h_predictor_16x16 vp9_h_predictor_16x16_c @@ -368,18 +359,6 @@ void vp9_mbpost_proc_down_c(uint8_t *dst, int pitch, int rows, int cols, 
int fli void vp9_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); #define vp9_minmax_8x8 vp9_minmax_8x8_c -unsigned int vp9_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -#define vp9_mse16x16 vp9_mse16x16_c - -unsigned int vp9_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -#define vp9_mse16x8 vp9_mse16x8_c - -unsigned int vp9_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -#define vp9_mse8x16 vp9_mse8x16_c - -unsigned int vp9_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -#define vp9_mse8x8 vp9_mse8x8_c - void vp9_plane_add_noise_c(uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch); #define vp9_plane_add_noise vp9_plane_add_noise_c @@ -509,45 +488,6 @@ void vp9_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov void vp9_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_v_predictor_8x8 vp9_v_predictor_8x8_c -unsigned int vp9_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance16x16 vp9_variance16x16_c - -unsigned int vp9_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance16x32 vp9_variance16x32_c - -unsigned int vp9_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance16x8 vp9_variance16x8_c - -unsigned int vp9_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance32x16 vp9_variance32x16_c - -unsigned int vp9_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance32x32 vp9_variance32x32_c - -unsigned int vp9_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance32x64 vp9_variance32x64_c - -unsigned int vp9_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance4x4 vp9_variance4x4_c - -unsigned int vp9_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance4x8 vp9_variance4x8_c - -unsigned int vp9_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance64x32 vp9_variance64x32_c - -unsigned int vp9_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance64x64 vp9_variance64x64_c - -unsigned int vp9_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance8x16 vp9_variance8x16_c - -unsigned int vp9_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance8x4 vp9_variance8x4_c - -unsigned int vp9_variance8x8_c(const uint8_t 
*src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance8x8 vp9_variance8x8_c - int vp9_vector_var_c(int16_t const *ref, int16_t const *src, const int bwl); #define vp9_vector_var vp9_vector_var_c diff --git a/chromium/third_party/libvpx/source/config/linux/arm/vpx_config.asm b/chromium/third_party/libvpx/source/config/linux/arm/vpx_config.asm index cf7330af874..c0e9fb24ff9 100644 --- a/chromium/third_party/libvpx/source/config/linux/arm/vpx_config.asm +++ b/chromium/third_party/libvpx/source/config/linux/arm/vpx_config.asm @@ -23,7 +23,6 @@ .equ HAVE_AVX2 , 0 .equ HAVE_VPX_PORTS , 1 .equ HAVE_STDINT_H , 1 -.equ HAVE_ALT_TREE_LAYOUT , 0 .equ HAVE_PTHREAD_H , 1 .equ HAVE_SYS_MMAN_H , 1 .equ HAVE_UNISTD_H , 0 diff --git a/chromium/third_party/libvpx/source/config/linux/arm/vpx_config.c b/chromium/third_party/libvpx/source/config/linux/arm/vpx_config.c index 336b29cfd0c..0b06ef9adf1 100644 --- a/chromium/third_party/libvpx/source/config/linux/arm/vpx_config.c +++ b/chromium/third_party/libvpx/source/config/linux/arm/vpx_config.c @@ -5,5 +5,6 @@ /* tree. An additional intellectual property rights grant can be found */ /* in the file PATENTS. All contributing project authors may */ /* be found in the AUTHORS file in the root of the source tree. */ +#include "vpx/vpx_codec.h" static const char* const cfg = "--target=armv6-linux-gcc --enable-pic --enable-realtime-only --disable-install-bins --disable-install-libs --disable-edsp --enable-external-build --enable-postproc --disable-install-srcs --enable-multi-res-encoding --enable-temporal-denoising --disable-unit-tests --disable-install-docs --disable-examples --enable-vp9-temporal-denoising --enable-vp9-postproc --size-limit=16384x16384"; const char *vpx_codec_build_config(void) {return cfg;} diff --git a/chromium/third_party/libvpx/source/config/linux/arm/vpx_config.h b/chromium/third_party/libvpx/source/config/linux/arm/vpx_config.h index 99f4a7cea0a..0328467f029 100644 --- a/chromium/third_party/libvpx/source/config/linux/arm/vpx_config.h +++ b/chromium/third_party/libvpx/source/config/linux/arm/vpx_config.h @@ -32,7 +32,6 @@ #define HAVE_AVX2 0 #define HAVE_VPX_PORTS 1 #define HAVE_STDINT_H 1 -#define HAVE_ALT_TREE_LAYOUT 0 #define HAVE_PTHREAD_H 1 #define HAVE_SYS_MMAN_H 1 #define HAVE_UNISTD_H 0 diff --git a/chromium/third_party/libvpx/source/config/linux/arm/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/linux/arm/vpx_dsp_rtcd.h index 20948755101..ea77e14aa19 100644 --- a/chromium/third_party/libvpx/source/config/linux/arm/vpx_dsp_rtcd.h +++ b/chromium/third_party/libvpx/source/config/linux/arm/vpx_dsp_rtcd.h @@ -18,6 +18,34 @@ extern "C" { #endif +void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +#define vpx_comp_avg_pred vpx_comp_avg_pred_c + +void vpx_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_get16x16var vpx_get16x16var_c + +unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); +#define vpx_get4x4sse_cs vpx_get4x4sse_cs_c + +void vpx_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_get8x8var vpx_get8x8var_c + +unsigned int vpx_get_mb_ss_c(const int16_t *); +#define vpx_get_mb_ss vpx_get_mb_ss_c + +unsigned int vpx_mse16x16_c(const uint8_t 
*src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse16x16_media(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_mse16x16 vpx_mse16x16_media + +unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_mse16x8 vpx_mse16x8_c + +unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_mse8x16 vpx_mse8x16_c + +unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_mse8x8 vpx_mse8x8_c + unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad16x16_media(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); #define vpx_sad16x16 vpx_sad16x16_media @@ -184,6 +212,47 @@ void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * con void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); #define vpx_sad8x8x8 vpx_sad8x8x8_c +unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_media(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance16x16 vpx_variance16x16_media + +unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance16x32 vpx_variance16x32_c + +unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance16x8 vpx_variance16x8_c + +unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance32x16 vpx_variance32x16_c + +unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance32x32 vpx_variance32x32_c + +unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance32x64 vpx_variance32x64_c + +unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance4x4 vpx_variance4x4_c + +unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance4x8 vpx_variance4x8_c + +unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance64x32 vpx_variance64x32_c + +unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance64x64 vpx_variance64x64_c + +unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance8x16 vpx_variance8x16_c + +unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int source_stride, const 
uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance8x4 vpx_variance8x4_c + +unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x8_media(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance8x8 vpx_variance8x8_media + void vpx_dsp_rtcd(void); #include "vpx_config.h" diff --git a/chromium/third_party/libvpx/source/config/linux/arm64/vp8_rtcd.h b/chromium/third_party/libvpx/source/config/linux/arm64/vp8_rtcd.h index 60f13cf8812..af969c3617f 100644 --- a/chromium/third_party/libvpx/source/config/linux/arm64/vp8_rtcd.h +++ b/chromium/third_party/libvpx/source/config/linux/arm64/vp8_rtcd.h @@ -31,8 +31,7 @@ void vp8_bilinear_predict16x16_neon(unsigned char *src, int src_pitch, int xofst #define vp8_bilinear_predict16x16 vp8_bilinear_predict16x16_neon void vp8_bilinear_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_bilinear_predict4x4_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -#define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_neon +#define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_c void vp8_bilinear_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); void vp8_bilinear_predict8x4_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); @@ -124,13 +123,6 @@ void vp8_filter_by_weight8x8_c(unsigned char *src, int src_stride, unsigned char int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); #define vp8_full_search_sad vp8_full_search_sad_c -unsigned int vp8_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); -unsigned int vp8_get4x4sse_cs_neon(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); -#define vp8_get4x4sse_cs vp8_get4x4sse_cs_neon - -unsigned int vp8_get_mb_ss_c(const short *); -#define vp8_get_mb_ss vp8_get_mb_ss_c - void vp8_intra4x4_predict_c(unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left); #define vp8_intra4x4_predict vp8_intra4x4_predict_c @@ -178,10 +170,6 @@ void vp8_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,in int vp8_mbuverror_c(struct macroblock *mb); #define vp8_mbuverror vp8_mbuverror_c -unsigned int vp8_mse16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_mse16x16_neon(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_mse16x16 vp8_mse16x16_neon - void vp8_plane_add_noise_c(unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch); #define vp8_plane_add_noise vp8_plane_add_noise_c @@ -232,9 +220,6 @@ void vp8_sixtap_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int y void vp8_sixtap_predict8x8_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); #define vp8_sixtap_predict8x8 
vp8_sixtap_predict8x8_neon -unsigned int vp8_sub_pixel_mse16x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); -#define vp8_sub_pixel_mse16x16 vp8_sub_pixel_mse16x16_c - unsigned int vp8_sub_pixel_variance16x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); #define vp8_sub_pixel_variance16x16 vp8_sub_pixel_variance16x16_c @@ -262,25 +247,6 @@ void vp8_subtract_mby_c(short *diff, unsigned char *src, int src_stride, unsigne void vp8_subtract_mby_neon(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride); #define vp8_subtract_mby vp8_subtract_mby_neon -unsigned int vp8_variance16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance16x16_neon(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_variance16x16 vp8_variance16x16_neon - -unsigned int vp8_variance16x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance16x8_neon(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_variance16x8 vp8_variance16x8_neon - -unsigned int vp8_variance4x4_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_variance4x4 vp8_variance4x4_c - -unsigned int vp8_variance8x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance8x16_neon(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_variance8x16 vp8_variance8x16_neon - -unsigned int vp8_variance8x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance8x8_neon(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_variance8x8 vp8_variance8x8_neon - unsigned int vp8_variance_halfpixvar16x16_h_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); unsigned int vp8_variance_halfpixvar16x16_h_neon(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); #define vp8_variance_halfpixvar16x16_h vp8_variance_halfpixvar16x16_h_neon diff --git a/chromium/third_party/libvpx/source/config/linux/arm64/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/linux/arm64/vp9_rtcd.h index a976f9a7731..1541cf05522 100644 --- a/chromium/third_party/libvpx/source/config/linux/arm64/vp9_rtcd.h +++ b/chromium/third_party/libvpx/source/config/linux/arm64/vp9_rtcd.h @@ -93,7 +93,8 @@ void vp9_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t #define vp9_d135_predictor_32x32 vp9_d135_predictor_32x32_c void vp9_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d135_predictor_4x4 vp9_d135_predictor_4x4_c +void vp9_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define 
vp9_d135_predictor_4x4 vp9_d135_predictor_4x4_neon void vp9_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d135_predictor_8x8 vp9_d135_predictor_8x8_c @@ -123,16 +124,19 @@ void vp9_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *a #define vp9_d207_predictor_8x8 vp9_d207_predictor_8x8_c void vp9_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d45_predictor_16x16 vp9_d45_predictor_16x16_c +void vp9_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d45_predictor_16x16 vp9_d45_predictor_16x16_neon void vp9_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d45_predictor_32x32 vp9_d45_predictor_32x32_c void vp9_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d45_predictor_4x4 vp9_d45_predictor_4x4_c +void vp9_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d45_predictor_4x4 vp9_d45_predictor_4x4_neon void vp9_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d45_predictor_8x8 vp9_d45_predictor_8x8_c +void vp9_d45_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d45_predictor_8x8 vp9_d45_predictor_8x8_neon void vp9_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d63_predictor_16x16 vp9_d63_predictor_16x16_c @@ -147,52 +151,68 @@ void vp9_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *ab #define vp9_d63_predictor_8x8 vp9_d63_predictor_8x8_c void vp9_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_128_predictor_16x16 vp9_dc_128_predictor_16x16_c +void vp9_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_128_predictor_16x16 vp9_dc_128_predictor_16x16_neon void vp9_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_128_predictor_32x32 vp9_dc_128_predictor_32x32_c +void vp9_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_128_predictor_32x32 vp9_dc_128_predictor_32x32_neon void vp9_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_128_predictor_4x4 vp9_dc_128_predictor_4x4_c +void vp9_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_128_predictor_4x4 vp9_dc_128_predictor_4x4_neon void vp9_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_128_predictor_8x8 vp9_dc_128_predictor_8x8_c +void vp9_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_128_predictor_8x8 vp9_dc_128_predictor_8x8_neon void vp9_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_left_predictor_16x16 vp9_dc_left_predictor_16x16_c +void vp9_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const 
uint8_t *above, const uint8_t *left); +#define vp9_dc_left_predictor_16x16 vp9_dc_left_predictor_16x16_neon void vp9_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_left_predictor_32x32 vp9_dc_left_predictor_32x32_c +void vp9_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_left_predictor_32x32 vp9_dc_left_predictor_32x32_neon void vp9_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_left_predictor_4x4 vp9_dc_left_predictor_4x4_c +void vp9_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_left_predictor_4x4 vp9_dc_left_predictor_4x4_neon void vp9_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_left_predictor_8x8 vp9_dc_left_predictor_8x8_c +void vp9_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_left_predictor_8x8 vp9_dc_left_predictor_8x8_neon void vp9_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_predictor_16x16 vp9_dc_predictor_16x16_c +void vp9_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_predictor_16x16 vp9_dc_predictor_16x16_neon void vp9_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_predictor_32x32 vp9_dc_predictor_32x32_c +void vp9_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_predictor_32x32 vp9_dc_predictor_32x32_neon void vp9_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_predictor_4x4 vp9_dc_predictor_4x4_c +void vp9_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_predictor_4x4 vp9_dc_predictor_4x4_neon void vp9_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_predictor_8x8 vp9_dc_predictor_8x8_c +void vp9_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_predictor_8x8 vp9_dc_predictor_8x8_neon void vp9_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_top_predictor_16x16 vp9_dc_top_predictor_16x16_c +void vp9_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_top_predictor_16x16 vp9_dc_top_predictor_16x16_neon void vp9_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_top_predictor_32x32 vp9_dc_top_predictor_32x32_c +void vp9_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_top_predictor_32x32 vp9_dc_top_predictor_32x32_neon void vp9_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_top_predictor_4x4 vp9_dc_top_predictor_4x4_c +void vp9_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_top_predictor_4x4 
vp9_dc_top_predictor_4x4_neon void vp9_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_top_predictor_8x8 vp9_dc_top_predictor_8x8_c +void vp9_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_top_predictor_8x8 vp9_dc_top_predictor_8x8_neon int vp9_denoiser_filter_c(const uint8_t *sig, int sig_stride, const uint8_t *mc_avg, int mc_avg_stride, uint8_t *avg, int avg_stride, int increase_denoising, BLOCK_SIZE bs, int motion_magnitude); #define vp9_denoiser_filter vp9_denoiser_filter_c @@ -257,17 +277,6 @@ int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, i void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); #define vp9_fwht4x4 vp9_fwht4x4_c -void vp9_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -void vp9_get16x16var_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -#define vp9_get16x16var vp9_get16x16var_neon - -void vp9_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -void vp9_get8x8var_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -#define vp9_get8x8var vp9_get8x8var_neon - -unsigned int vp9_get_mb_ss_c(const int16_t *); -#define vp9_get_mb_ss vp9_get_mb_ss_c - void vp9_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vp9_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_h_predictor_16x16 vp9_h_predictor_16x16_neon @@ -402,18 +411,6 @@ void vp9_mbpost_proc_down_c(uint8_t *dst, int pitch, int rows, int cols, int fli void vp9_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); #define vp9_minmax_8x8 vp9_minmax_8x8_c -unsigned int vp9_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -#define vp9_mse16x16 vp9_mse16x16_c - -unsigned int vp9_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -#define vp9_mse16x8 vp9_mse16x8_c - -unsigned int vp9_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -#define vp9_mse8x16 vp9_mse8x16_c - -unsigned int vp9_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -#define vp9_mse8x8 vp9_mse8x8_c - void vp9_plane_add_noise_c(uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch); #define vp9_plane_add_noise vp9_plane_add_noise_c @@ -557,51 +554,6 @@ void vp9_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov void vp9_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_v_predictor_8x8 vp9_v_predictor_8x8_neon -unsigned int vp9_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance16x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance16x16 
vp9_variance16x16_neon - -unsigned int vp9_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance16x32 vp9_variance16x32_c - -unsigned int vp9_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance16x8 vp9_variance16x8_c - -unsigned int vp9_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance32x16 vp9_variance32x16_c - -unsigned int vp9_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance32x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance32x32 vp9_variance32x32_neon - -unsigned int vp9_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance32x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance32x64 vp9_variance32x64_neon - -unsigned int vp9_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance4x4 vp9_variance4x4_c - -unsigned int vp9_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance4x8 vp9_variance4x8_c - -unsigned int vp9_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance64x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance64x32 vp9_variance64x32_neon - -unsigned int vp9_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance64x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance64x64 vp9_variance64x64_neon - -unsigned int vp9_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance8x16 vp9_variance8x16_c - -unsigned int vp9_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance8x4 vp9_variance8x4_c - -unsigned int vp9_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance8x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance8x8 vp9_variance8x8_neon - int vp9_vector_var_c(int16_t const *ref, int16_t const *src, const int bwl); #define vp9_vector_var vp9_vector_var_c diff --git a/chromium/third_party/libvpx/source/config/linux/arm64/vpx_config.asm b/chromium/third_party/libvpx/source/config/linux/arm64/vpx_config.asm index c7ef4366028..8d1201d83be 100644 --- a/chromium/third_party/libvpx/source/config/linux/arm64/vpx_config.asm +++ b/chromium/third_party/libvpx/source/config/linux/arm64/vpx_config.asm @@ -23,7 +23,6 @@ .equ HAVE_AVX2 , 0 .equ HAVE_VPX_PORTS , 1 .equ HAVE_STDINT_H , 1 -.equ HAVE_ALT_TREE_LAYOUT , 0 .equ HAVE_PTHREAD_H , 1 .equ 
HAVE_SYS_MMAN_H , 1 .equ HAVE_UNISTD_H , 0 diff --git a/chromium/third_party/libvpx/source/config/linux/arm64/vpx_config.c b/chromium/third_party/libvpx/source/config/linux/arm64/vpx_config.c index 84cebcdae2f..6674d94aed4 100644 --- a/chromium/third_party/libvpx/source/config/linux/arm64/vpx_config.c +++ b/chromium/third_party/libvpx/source/config/linux/arm64/vpx_config.c @@ -5,5 +5,6 @@ /* tree. An additional intellectual property rights grant can be found */ /* in the file PATENTS. All contributing project authors may */ /* be found in the AUTHORS file in the root of the source tree. */ +#include "vpx/vpx_codec.h" static const char* const cfg = "--force-target=armv8-linux-gcc --enable-pic --enable-realtime-only --disable-edsp --enable-external-build --enable-postproc --disable-install-srcs --enable-multi-res-encoding --enable-temporal-denoising --disable-unit-tests --disable-install-docs --disable-examples --enable-vp9-temporal-denoising --enable-vp9-postproc --size-limit=16384x16384"; const char *vpx_codec_build_config(void) {return cfg;} diff --git a/chromium/third_party/libvpx/source/config/linux/arm64/vpx_config.h b/chromium/third_party/libvpx/source/config/linux/arm64/vpx_config.h index 0409b886a57..f54e589ef32 100644 --- a/chromium/third_party/libvpx/source/config/linux/arm64/vpx_config.h +++ b/chromium/third_party/libvpx/source/config/linux/arm64/vpx_config.h @@ -32,7 +32,6 @@ #define HAVE_AVX2 0 #define HAVE_VPX_PORTS 1 #define HAVE_STDINT_H 1 -#define HAVE_ALT_TREE_LAYOUT 0 #define HAVE_PTHREAD_H 1 #define HAVE_SYS_MMAN_H 1 #define HAVE_UNISTD_H 0 diff --git a/chromium/third_party/libvpx/source/config/linux/arm64/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/linux/arm64/vpx_dsp_rtcd.h index a0b6898b29b..af0741cca13 100644 --- a/chromium/third_party/libvpx/source/config/linux/arm64/vpx_dsp_rtcd.h +++ b/chromium/third_party/libvpx/source/config/linux/arm64/vpx_dsp_rtcd.h @@ -18,6 +18,37 @@ extern "C" { #endif +void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +#define vpx_comp_avg_pred vpx_comp_avg_pred_c + +void vpx_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get16x16var_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_get16x16var vpx_get16x16var_neon + +unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); +unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); +#define vpx_get4x4sse_cs vpx_get4x4sse_cs_neon + +void vpx_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get8x8var_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_get8x8var vpx_get8x8var_neon + +unsigned int vpx_get_mb_ss_c(const int16_t *); +#define vpx_get_mb_ss vpx_get_mb_ss_c + +unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse16x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_mse16x16 vpx_mse16x16_neon + +unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int 
source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_mse16x8 vpx_mse16x8_c + +unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_mse8x16 vpx_mse8x16_c + +unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_mse8x8 vpx_mse8x8_c + unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); #define vpx_sad16x16 vpx_sad16x16_neon @@ -193,6 +224,53 @@ void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * con void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); #define vpx_sad8x8x8 vpx_sad8x8x8_c +unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance16x16 vpx_variance16x16_neon + +unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance16x32 vpx_variance16x32_c + +unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance16x8 vpx_variance16x8_neon + +unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance32x16 vpx_variance32x16_c + +unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance32x32 vpx_variance32x32_neon + +unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance32x64 vpx_variance32x64_neon + +unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance4x4 vpx_variance4x4_c + +unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance4x8 vpx_variance4x8_c + +unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance64x32 vpx_variance64x32_neon + +unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x64_neon(const uint8_t *src_ptr, int 
source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance64x64 vpx_variance64x64_neon + +unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance8x16 vpx_variance8x16_neon + +unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance8x4 vpx_variance8x4_c + +unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance8x8 vpx_variance8x8_neon + void vpx_dsp_rtcd(void); #include "vpx_config.h" diff --git a/chromium/third_party/libvpx/source/config/linux/generic/vp8_rtcd.h b/chromium/third_party/libvpx/source/config/linux/generic/vp8_rtcd.h index 1bd7496fe55..7a33ed2effb 100644 --- a/chromium/third_party/libvpx/source/config/linux/generic/vp8_rtcd.h +++ b/chromium/third_party/libvpx/source/config/linux/generic/vp8_rtcd.h @@ -107,12 +107,6 @@ void vp8_filter_by_weight8x8_c(unsigned char *src, int src_stride, unsigned char int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); #define vp8_full_search_sad vp8_full_search_sad_c -unsigned int vp8_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); -#define vp8_get4x4sse_cs vp8_get4x4sse_cs_c - -unsigned int vp8_get_mb_ss_c(const short *); -#define vp8_get_mb_ss vp8_get_mb_ss_c - void vp8_intra4x4_predict_c(unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left); #define vp8_intra4x4_predict vp8_intra4x4_predict_c @@ -152,9 +146,6 @@ void vp8_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,in int vp8_mbuverror_c(struct macroblock *mb); #define vp8_mbuverror vp8_mbuverror_c -unsigned int vp8_mse16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_mse16x16 vp8_mse16x16_c - void vp8_plane_add_noise_c(unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch); #define vp8_plane_add_noise vp8_plane_add_noise_c @@ -197,9 +188,6 @@ void vp8_sixtap_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int y void vp8_sixtap_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); #define vp8_sixtap_predict8x8 vp8_sixtap_predict8x8_c -unsigned int vp8_sub_pixel_mse16x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); -#define vp8_sub_pixel_mse16x16 vp8_sub_pixel_mse16x16_c - unsigned int vp8_sub_pixel_variance16x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); #define vp8_sub_pixel_variance16x16 vp8_sub_pixel_variance16x16_c @@ -224,21 +212,6 @@ void 
vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, void vp8_subtract_mby_c(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride); #define vp8_subtract_mby vp8_subtract_mby_c -unsigned int vp8_variance16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_variance16x16 vp8_variance16x16_c - -unsigned int vp8_variance16x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_variance16x8 vp8_variance16x8_c - -unsigned int vp8_variance4x4_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_variance4x4 vp8_variance4x4_c - -unsigned int vp8_variance8x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_variance8x16 vp8_variance8x16_c - -unsigned int vp8_variance8x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_variance8x8 vp8_variance8x8_c - unsigned int vp8_variance_halfpixvar16x16_h_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); #define vp8_variance_halfpixvar16x16_h vp8_variance_halfpixvar16x16_h_c diff --git a/chromium/third_party/libvpx/source/config/linux/generic/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/linux/generic/vp9_rtcd.h index 84bd45f2876..8cf86073577 100644 --- a/chromium/third_party/libvpx/source/config/linux/generic/vp9_rtcd.h +++ b/chromium/third_party/libvpx/source/config/linux/generic/vp9_rtcd.h @@ -245,15 +245,6 @@ int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, i void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); #define vp9_fwht4x4 vp9_fwht4x4_c -void vp9_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -#define vp9_get16x16var vp9_get16x16var_c - -void vp9_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -#define vp9_get8x8var vp9_get8x8var_c - -unsigned int vp9_get_mb_ss_c(const int16_t *); -#define vp9_get_mb_ss vp9_get_mb_ss_c - void vp9_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_h_predictor_16x16 vp9_h_predictor_16x16_c @@ -368,18 +359,6 @@ void vp9_mbpost_proc_down_c(uint8_t *dst, int pitch, int rows, int cols, int fli void vp9_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); #define vp9_minmax_8x8 vp9_minmax_8x8_c -unsigned int vp9_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -#define vp9_mse16x16 vp9_mse16x16_c - -unsigned int vp9_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -#define vp9_mse16x8 vp9_mse16x8_c - -unsigned int vp9_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -#define vp9_mse8x16 vp9_mse8x16_c - -unsigned int vp9_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -#define vp9_mse8x8 vp9_mse8x8_c - void vp9_plane_add_noise_c(uint8_t *Start, char 
*noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch); #define vp9_plane_add_noise vp9_plane_add_noise_c @@ -509,45 +488,6 @@ void vp9_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov void vp9_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_v_predictor_8x8 vp9_v_predictor_8x8_c -unsigned int vp9_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance16x16 vp9_variance16x16_c - -unsigned int vp9_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance16x32 vp9_variance16x32_c - -unsigned int vp9_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance16x8 vp9_variance16x8_c - -unsigned int vp9_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance32x16 vp9_variance32x16_c - -unsigned int vp9_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance32x32 vp9_variance32x32_c - -unsigned int vp9_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance32x64 vp9_variance32x64_c - -unsigned int vp9_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance4x4 vp9_variance4x4_c - -unsigned int vp9_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance4x8 vp9_variance4x8_c - -unsigned int vp9_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance64x32 vp9_variance64x32_c - -unsigned int vp9_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance64x64 vp9_variance64x64_c - -unsigned int vp9_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance8x16 vp9_variance8x16_c - -unsigned int vp9_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance8x4 vp9_variance8x4_c - -unsigned int vp9_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance8x8 vp9_variance8x8_c - int vp9_vector_var_c(int16_t const *ref, int16_t const *src, const int bwl); #define vp9_vector_var vp9_vector_var_c diff --git a/chromium/third_party/libvpx/source/config/linux/generic/vpx_config.asm b/chromium/third_party/libvpx/source/config/linux/generic/vpx_config.asm index 20db06465bf..a2a18fdc7fd 100644 --- a/chromium/third_party/libvpx/source/config/linux/generic/vpx_config.asm +++ b/chromium/third_party/libvpx/source/config/linux/generic/vpx_config.asm @@ -23,7 +23,6 @@ .equ HAVE_AVX2 , 0 .equ HAVE_VPX_PORTS , 1 .equ HAVE_STDINT_H , 1 -.equ HAVE_ALT_TREE_LAYOUT , 0 .equ HAVE_PTHREAD_H , 1 .equ HAVE_SYS_MMAN_H , 1 .equ HAVE_UNISTD_H , 0 diff --git a/chromium/third_party/libvpx/source/config/linux/generic/vpx_config.c 
b/chromium/third_party/libvpx/source/config/linux/generic/vpx_config.c index 0e8d7265f36..2a18a45e3e3 100644 --- a/chromium/third_party/libvpx/source/config/linux/generic/vpx_config.c +++ b/chromium/third_party/libvpx/source/config/linux/generic/vpx_config.c @@ -5,5 +5,6 @@ /* tree. An additional intellectual property rights grant can be found */ /* in the file PATENTS. All contributing project authors may */ /* be found in the AUTHORS file in the root of the source tree. */ +#include "vpx/vpx_codec.h" static const char* const cfg = "--target=generic-gnu --enable-pic --enable-realtime-only --enable-external-build --enable-postproc --disable-install-srcs --enable-multi-res-encoding --enable-temporal-denoising --disable-unit-tests --disable-install-docs --disable-examples --enable-vp9-temporal-denoising --enable-vp9-postproc --size-limit=16384x16384"; const char *vpx_codec_build_config(void) {return cfg;} diff --git a/chromium/third_party/libvpx/source/config/linux/generic/vpx_config.h b/chromium/third_party/libvpx/source/config/linux/generic/vpx_config.h index ec8222e18c2..290508c1c88 100644 --- a/chromium/third_party/libvpx/source/config/linux/generic/vpx_config.h +++ b/chromium/third_party/libvpx/source/config/linux/generic/vpx_config.h @@ -32,7 +32,6 @@ #define HAVE_AVX2 0 #define HAVE_VPX_PORTS 1 #define HAVE_STDINT_H 1 -#define HAVE_ALT_TREE_LAYOUT 0 #define HAVE_PTHREAD_H 1 #define HAVE_SYS_MMAN_H 1 #define HAVE_UNISTD_H 0 diff --git a/chromium/third_party/libvpx/source/config/linux/generic/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/linux/generic/vpx_dsp_rtcd.h index 86c5b4425dc..f086946da23 100644 --- a/chromium/third_party/libvpx/source/config/linux/generic/vpx_dsp_rtcd.h +++ b/chromium/third_party/libvpx/source/config/linux/generic/vpx_dsp_rtcd.h @@ -18,6 +18,33 @@ extern "C" { #endif +void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +#define vpx_comp_avg_pred vpx_comp_avg_pred_c + +void vpx_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_get16x16var vpx_get16x16var_c + +unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); +#define vpx_get4x4sse_cs vpx_get4x4sse_cs_c + +void vpx_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_get8x8var vpx_get8x8var_c + +unsigned int vpx_get_mb_ss_c(const int16_t *); +#define vpx_get_mb_ss vpx_get_mb_ss_c + +unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_mse16x16 vpx_mse16x16_c + +unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_mse16x8 vpx_mse16x8_c + +unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_mse8x16 vpx_mse8x16_c + +unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_mse8x8 vpx_mse8x8_c + unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); #define vpx_sad16x16 vpx_sad16x16_c @@ -183,6 +210,45 @@ void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const 
uint8_t * con void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); #define vpx_sad8x8x8 vpx_sad8x8x8_c +unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance16x16 vpx_variance16x16_c + +unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance16x32 vpx_variance16x32_c + +unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance16x8 vpx_variance16x8_c + +unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance32x16 vpx_variance32x16_c + +unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance32x32 vpx_variance32x32_c + +unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance32x64 vpx_variance32x64_c + +unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance4x4 vpx_variance4x4_c + +unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance4x8 vpx_variance4x8_c + +unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance64x32 vpx_variance64x32_c + +unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance64x64 vpx_variance64x64_c + +unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance8x16 vpx_variance8x16_c + +unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance8x4 vpx_variance8x4_c + +unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance8x8 vpx_variance8x8_c + void vpx_dsp_rtcd(void); #include "vpx_config.h" diff --git a/chromium/third_party/libvpx/source/config/linux/ia32/vp8_rtcd.h b/chromium/third_party/libvpx/source/config/linux/ia32/vp8_rtcd.h index b60e7ac7301..38fe6d360b0 100644 --- a/chromium/third_party/libvpx/source/config/linux/ia32/vp8_rtcd.h +++ b/chromium/third_party/libvpx/source/config/linux/ia32/vp8_rtcd.h @@ -74,10 +74,10 @@ void vp8_clear_system_state_c(); void vpx_reset_mmx_state(); RTCD_EXTERN void (*vp8_clear_system_state)(); -void vp8_copy32xn_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n); -void vp8_copy32xn_sse2(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n); -void vp8_copy32xn_sse3(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n); -RTCD_EXTERN void (*vp8_copy32xn)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int 
ref_stride, int n); +void vp8_copy32xn_c(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n); +void vp8_copy32xn_sse2(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n); +void vp8_copy32xn_sse3(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n); +RTCD_EXTERN void (*vp8_copy32xn)(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n); void vp8_copy_mem16x16_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); void vp8_copy_mem16x16_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); @@ -147,15 +147,6 @@ int vp8_full_search_sadx3(struct macroblock *x, struct block *b, struct blockd * int vp8_full_search_sadx8(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); RTCD_EXTERN int (*vp8_full_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -unsigned int vp8_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); -unsigned int vp8_get4x4sse_cs_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); -RTCD_EXTERN unsigned int (*vp8_get4x4sse_cs)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); - -unsigned int vp8_get_mb_ss_c(const short *); -unsigned int vp8_get_mb_ss_mmx(const short *); -unsigned int vp8_get_mb_ss_sse2(const short *); -RTCD_EXTERN unsigned int (*vp8_get_mb_ss)(const short *); - void vp8_intra4x4_predict_c(unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left); #define vp8_intra4x4_predict vp8_intra4x4_predict_c @@ -218,11 +209,6 @@ int vp8_mbuverror_mmx(struct macroblock *mb); int vp8_mbuverror_xmm(struct macroblock *mb); RTCD_EXTERN int (*vp8_mbuverror)(struct macroblock *mb); -unsigned int vp8_mse16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_mse16x16_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_mse16x16_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp8_mse16x16)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); - void vp8_plane_add_noise_c(unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch); void vp8_plane_add_noise_mmx(unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch); void vp8_plane_add_noise_wmt(unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch); @@ -290,11 +276,6 @@ void vp8_sixtap_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, in void vp8_sixtap_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); 
RTCD_EXTERN void (*vp8_sixtap_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -unsigned int vp8_sub_pixel_mse16x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); -unsigned int vp8_sub_pixel_mse16x16_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); -unsigned int vp8_sub_pixel_mse16x16_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp8_sub_pixel_mse16x16)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); - unsigned int vp8_sub_pixel_variance16x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); unsigned int vp8_sub_pixel_variance16x16_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); unsigned int vp8_sub_pixel_variance16x16_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); @@ -337,31 +318,6 @@ void vp8_subtract_mby_mmx(short *diff, unsigned char *src, int src_stride, unsig void vp8_subtract_mby_sse2(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride); RTCD_EXTERN void (*vp8_subtract_mby)(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride); -unsigned int vp8_variance16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance16x16_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance16x16_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp8_variance16x16)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp8_variance16x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance16x8_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance16x8_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp8_variance16x8)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp8_variance4x4_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance4x4_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance4x4_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp8_variance4x4)(const unsigned char *src_ptr, int source_stride, const unsigned char 
*ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp8_variance8x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance8x16_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance8x16_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp8_variance8x16)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp8_variance8x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance8x8_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance8x8_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp8_variance8x8)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); - unsigned int vp8_variance_halfpixvar16x16_h_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); unsigned int vp8_variance_halfpixvar16x16_h_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); unsigned int vp8_variance_halfpixvar16x16_h_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); @@ -448,11 +404,6 @@ static void setup_rtcd_internal(void) vp8_full_search_sad = vp8_full_search_sad_c; if (flags & HAS_SSE3) vp8_full_search_sad = vp8_full_search_sadx3; if (flags & HAS_SSE4_1) vp8_full_search_sad = vp8_full_search_sadx8; - vp8_get4x4sse_cs = vp8_get4x4sse_cs_c; - if (flags & HAS_MMX) vp8_get4x4sse_cs = vp8_get4x4sse_cs_mmx; - vp8_get_mb_ss = vp8_get_mb_ss_c; - if (flags & HAS_MMX) vp8_get_mb_ss = vp8_get_mb_ss_mmx; - if (flags & HAS_SSE2) vp8_get_mb_ss = vp8_get_mb_ss_sse2; vp8_loop_filter_bh = vp8_loop_filter_bh_c; if (flags & HAS_MMX) vp8_loop_filter_bh = vp8_loop_filter_bh_mmx; if (flags & HAS_SSE2) vp8_loop_filter_bh = vp8_loop_filter_bh_sse2; @@ -488,9 +439,6 @@ static void setup_rtcd_internal(void) vp8_mbuverror = vp8_mbuverror_c; if (flags & HAS_MMX) vp8_mbuverror = vp8_mbuverror_mmx; if (flags & HAS_SSE2) vp8_mbuverror = vp8_mbuverror_xmm; - vp8_mse16x16 = vp8_mse16x16_c; - if (flags & HAS_MMX) vp8_mse16x16 = vp8_mse16x16_mmx; - if (flags & HAS_SSE2) vp8_mse16x16 = vp8_mse16x16_wmt; vp8_plane_add_noise = vp8_plane_add_noise_c; if (flags & HAS_MMX) vp8_plane_add_noise = vp8_plane_add_noise_mmx; if (flags & HAS_SSE2) vp8_plane_add_noise = vp8_plane_add_noise_wmt; @@ -529,9 +477,6 @@ static void setup_rtcd_internal(void) if (flags & HAS_MMX) vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_mmx; if (flags & HAS_SSE2) vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_sse2; if (flags & HAS_SSSE3) vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_ssse3; - vp8_sub_pixel_mse16x16 = vp8_sub_pixel_mse16x16_c; - if (flags & HAS_MMX) vp8_sub_pixel_mse16x16 = vp8_sub_pixel_mse16x16_mmx; - if (flags & HAS_SSE2) vp8_sub_pixel_mse16x16 = vp8_sub_pixel_mse16x16_wmt; vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_c; if (flags & HAS_MMX) 
vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_mmx; if (flags & HAS_SSE2) vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_wmt; @@ -558,21 +503,6 @@ static void setup_rtcd_internal(void) vp8_subtract_mby = vp8_subtract_mby_c; if (flags & HAS_MMX) vp8_subtract_mby = vp8_subtract_mby_mmx; if (flags & HAS_SSE2) vp8_subtract_mby = vp8_subtract_mby_sse2; - vp8_variance16x16 = vp8_variance16x16_c; - if (flags & HAS_MMX) vp8_variance16x16 = vp8_variance16x16_mmx; - if (flags & HAS_SSE2) vp8_variance16x16 = vp8_variance16x16_wmt; - vp8_variance16x8 = vp8_variance16x8_c; - if (flags & HAS_MMX) vp8_variance16x8 = vp8_variance16x8_mmx; - if (flags & HAS_SSE2) vp8_variance16x8 = vp8_variance16x8_wmt; - vp8_variance4x4 = vp8_variance4x4_c; - if (flags & HAS_MMX) vp8_variance4x4 = vp8_variance4x4_mmx; - if (flags & HAS_SSE2) vp8_variance4x4 = vp8_variance4x4_wmt; - vp8_variance8x16 = vp8_variance8x16_c; - if (flags & HAS_MMX) vp8_variance8x16 = vp8_variance8x16_mmx; - if (flags & HAS_SSE2) vp8_variance8x16 = vp8_variance8x16_wmt; - vp8_variance8x8 = vp8_variance8x8_c; - if (flags & HAS_MMX) vp8_variance8x8 = vp8_variance8x8_mmx; - if (flags & HAS_SSE2) vp8_variance8x8 = vp8_variance8x8_wmt; vp8_variance_halfpixvar16x16_h = vp8_variance_halfpixvar16x16_h_c; if (flags & HAS_MMX) vp8_variance_halfpixvar16x16_h = vp8_variance_halfpixvar16x16_h_mmx; if (flags & HAS_SSE2) vp8_variance_halfpixvar16x16_h = vp8_variance_halfpixvar16x16_h_wmt; diff --git a/chromium/third_party/libvpx/source/config/linux/ia32/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/linux/ia32/vp9_rtcd.h index 7425dc281bd..1da20dc1bcc 100644 --- a/chromium/third_party/libvpx/source/config/linux/ia32/vp9_rtcd.h +++ b/chromium/third_party/libvpx/source/config/linux/ia32/vp9_rtcd.h @@ -116,7 +116,8 @@ void vp9_d153_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint RTCD_EXTERN void (*vp9_d153_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vp9_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d153_predictor_32x32 vp9_d153_predictor_32x32_c +void vp9_d153_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_d153_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vp9_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vp9_d153_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); @@ -320,19 +321,6 @@ void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); void vp9_fwht4x4_mmx(const int16_t *input, tran_low_t *output, int stride); RTCD_EXTERN void (*vp9_fwht4x4)(const int16_t *input, tran_low_t *output, int stride); -void vp9_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -void vp9_get16x16var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -void vp9_get16x16var_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -RTCD_EXTERN void (*vp9_get16x16var)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); - -void vp9_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, 
int ref_stride, unsigned int *sse, int *sum); -void vp9_get8x8var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -RTCD_EXTERN void (*vp9_get8x8var)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); - -unsigned int vp9_get_mb_ss_c(const int16_t *); -unsigned int vp9_get_mb_ss_sse2(const int16_t *); -RTCD_EXTERN unsigned int (*vp9_get_mb_ss)(const int16_t *); - void vp9_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vp9_h_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); RTCD_EXTERN void (*vp9_h_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); @@ -484,23 +472,6 @@ void vp9_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *mi void vp9_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); RTCD_EXTERN void (*vp9_minmax_8x8)(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); -unsigned int vp9_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vp9_mse16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vp9_mse16x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_mse16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); - -unsigned int vp9_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vp9_mse16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_mse16x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); - -unsigned int vp9_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vp9_mse8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_mse8x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); - -unsigned int vp9_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vp9_mse8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_mse8x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); - void vp9_plane_add_noise_c(uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch); void vp9_plane_add_noise_wmt(uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch); RTCD_EXTERN void (*vp9_plane_add_noise)(uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch); @@ -700,63 +671,6 @@ void vp9_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, 
const uint8_t *abov void vp9_v_predictor_8x8_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); RTCD_EXTERN void (*vp9_v_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -unsigned int vp9_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance16x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance16x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance16x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance32x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance32x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance32x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance32x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int 
(*vp9_variance4x4)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance4x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance64x32_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance64x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance64x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance8x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance8x4)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance8x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - int vp9_vector_var_c(int16_t const *ref, int16_t const *src, const int bwl); int vp9_vector_var_sse2(int16_t const *ref, int16_t const *src, const int bwl); RTCD_EXTERN int (*vp9_vector_var)(int16_t const *ref, int16_t const *src, const int bwl); @@ -807,6 +721,8 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSE2) vp9_convolve_copy = vp9_convolve_copy_sse2; vp9_d153_predictor_16x16 = vp9_d153_predictor_16x16_c; if (flags & HAS_SSSE3) vp9_d153_predictor_16x16 = vp9_d153_predictor_16x16_ssse3; + vp9_d153_predictor_32x32 = vp9_d153_predictor_32x32_c; + if (flags & HAS_SSSE3) 
vp9_d153_predictor_32x32 = vp9_d153_predictor_32x32_ssse3; vp9_d153_predictor_4x4 = vp9_d153_predictor_4x4_c; if (flags & HAS_SSSE3) vp9_d153_predictor_4x4 = vp9_d153_predictor_4x4_ssse3; vp9_d153_predictor_8x8 = vp9_d153_predictor_8x8_c; @@ -907,13 +823,6 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSE4_1) vp9_full_search_sad = vp9_full_search_sadx8; vp9_fwht4x4 = vp9_fwht4x4_c; if (flags & HAS_MMX) vp9_fwht4x4 = vp9_fwht4x4_mmx; - vp9_get16x16var = vp9_get16x16var_c; - if (flags & HAS_SSE2) vp9_get16x16var = vp9_get16x16var_sse2; - if (flags & HAS_AVX2) vp9_get16x16var = vp9_get16x16var_avx2; - vp9_get8x8var = vp9_get8x8var_c; - if (flags & HAS_SSE2) vp9_get8x8var = vp9_get8x8var_sse2; - vp9_get_mb_ss = vp9_get_mb_ss_c; - if (flags & HAS_SSE2) vp9_get_mb_ss = vp9_get_mb_ss_sse2; vp9_h_predictor_16x16 = vp9_h_predictor_16x16_c; if (flags & HAS_SSSE3) vp9_h_predictor_16x16 = vp9_h_predictor_16x16_ssse3; vp9_h_predictor_32x32 = vp9_h_predictor_32x32_c; @@ -987,15 +896,6 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSE2) vp9_mbpost_proc_down = vp9_mbpost_proc_down_xmm; vp9_minmax_8x8 = vp9_minmax_8x8_c; if (flags & HAS_SSE2) vp9_minmax_8x8 = vp9_minmax_8x8_sse2; - vp9_mse16x16 = vp9_mse16x16_c; - if (flags & HAS_SSE2) vp9_mse16x16 = vp9_mse16x16_sse2; - if (flags & HAS_AVX2) vp9_mse16x16 = vp9_mse16x16_avx2; - vp9_mse16x8 = vp9_mse16x8_c; - if (flags & HAS_SSE2) vp9_mse16x8 = vp9_mse16x8_sse2; - vp9_mse8x16 = vp9_mse8x16_c; - if (flags & HAS_SSE2) vp9_mse8x16 = vp9_mse8x16_sse2; - vp9_mse8x8 = vp9_mse8x8_c; - if (flags & HAS_SSE2) vp9_mse8x8 = vp9_mse8x8_sse2; vp9_plane_add_noise = vp9_plane_add_noise_c; if (flags & HAS_SSE2) vp9_plane_add_noise = vp9_plane_add_noise_wmt; vp9_post_proc_down_and_across = vp9_post_proc_down_and_across_c; @@ -1106,37 +1006,6 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSE) vp9_v_predictor_4x4 = vp9_v_predictor_4x4_sse; vp9_v_predictor_8x8 = vp9_v_predictor_8x8_c; if (flags & HAS_SSE) vp9_v_predictor_8x8 = vp9_v_predictor_8x8_sse; - vp9_variance16x16 = vp9_variance16x16_c; - if (flags & HAS_SSE2) vp9_variance16x16 = vp9_variance16x16_sse2; - if (flags & HAS_AVX2) vp9_variance16x16 = vp9_variance16x16_avx2; - vp9_variance16x32 = vp9_variance16x32_c; - if (flags & HAS_SSE2) vp9_variance16x32 = vp9_variance16x32_sse2; - vp9_variance16x8 = vp9_variance16x8_c; - if (flags & HAS_SSE2) vp9_variance16x8 = vp9_variance16x8_sse2; - vp9_variance32x16 = vp9_variance32x16_c; - if (flags & HAS_SSE2) vp9_variance32x16 = vp9_variance32x16_sse2; - if (flags & HAS_AVX2) vp9_variance32x16 = vp9_variance32x16_avx2; - vp9_variance32x32 = vp9_variance32x32_c; - if (flags & HAS_SSE2) vp9_variance32x32 = vp9_variance32x32_sse2; - if (flags & HAS_AVX2) vp9_variance32x32 = vp9_variance32x32_avx2; - vp9_variance32x64 = vp9_variance32x64_c; - if (flags & HAS_SSE2) vp9_variance32x64 = vp9_variance32x64_sse2; - vp9_variance4x4 = vp9_variance4x4_c; - if (flags & HAS_SSE2) vp9_variance4x4 = vp9_variance4x4_sse2; - vp9_variance4x8 = vp9_variance4x8_c; - if (flags & HAS_SSE2) vp9_variance4x8 = vp9_variance4x8_sse2; - vp9_variance64x32 = vp9_variance64x32_c; - if (flags & HAS_SSE2) vp9_variance64x32 = vp9_variance64x32_sse2; - if (flags & HAS_AVX2) vp9_variance64x32 = vp9_variance64x32_avx2; - vp9_variance64x64 = vp9_variance64x64_c; - if (flags & HAS_SSE2) vp9_variance64x64 = vp9_variance64x64_sse2; - if (flags & HAS_AVX2) vp9_variance64x64 = vp9_variance64x64_avx2; - vp9_variance8x16 = vp9_variance8x16_c; - if (flags & HAS_SSE2) vp9_variance8x16 = 
vp9_variance8x16_sse2; - vp9_variance8x4 = vp9_variance8x4_c; - if (flags & HAS_SSE2) vp9_variance8x4 = vp9_variance8x4_sse2; - vp9_variance8x8 = vp9_variance8x8_c; - if (flags & HAS_SSE2) vp9_variance8x8 = vp9_variance8x8_sse2; vp9_vector_var = vp9_vector_var_c; if (flags & HAS_SSE2) vp9_vector_var = vp9_vector_var_sse2; } diff --git a/chromium/third_party/libvpx/source/config/linux/ia32/vpx_config.asm b/chromium/third_party/libvpx/source/config/linux/ia32/vpx_config.asm index b9418dda315..9550f440153 100644 --- a/chromium/third_party/libvpx/source/config/linux/ia32/vpx_config.asm +++ b/chromium/third_party/libvpx/source/config/linux/ia32/vpx_config.asm @@ -20,7 +20,6 @@ %define HAVE_AVX2 1 %define HAVE_VPX_PORTS 1 %define HAVE_STDINT_H 1 -%define HAVE_ALT_TREE_LAYOUT 0 %define HAVE_PTHREAD_H 1 %define HAVE_SYS_MMAN_H 1 %define HAVE_UNISTD_H 0 diff --git a/chromium/third_party/libvpx/source/config/linux/ia32/vpx_config.c b/chromium/third_party/libvpx/source/config/linux/ia32/vpx_config.c index f254f6d7167..08b58cc7244 100644 --- a/chromium/third_party/libvpx/source/config/linux/ia32/vpx_config.c +++ b/chromium/third_party/libvpx/source/config/linux/ia32/vpx_config.c @@ -5,5 +5,6 @@ /* tree. An additional intellectual property rights grant can be found */ /* in the file PATENTS. All contributing project authors may */ /* be found in the AUTHORS file in the root of the source tree. */ +#include "vpx/vpx_codec.h" static const char* const cfg = "--target=x86-linux-gcc --disable-ccache --enable-pic --enable-realtime-only --enable-external-build --enable-postproc --disable-install-srcs --enable-multi-res-encoding --enable-temporal-denoising --disable-unit-tests --disable-install-docs --disable-examples --enable-vp9-temporal-denoising --enable-vp9-postproc --size-limit=16384x16384"; const char *vpx_codec_build_config(void) {return cfg;} diff --git a/chromium/third_party/libvpx/source/config/linux/ia32/vpx_config.h b/chromium/third_party/libvpx/source/config/linux/ia32/vpx_config.h index 3715b4aef3e..c20daf6517f 100644 --- a/chromium/third_party/libvpx/source/config/linux/ia32/vpx_config.h +++ b/chromium/third_party/libvpx/source/config/linux/ia32/vpx_config.h @@ -32,7 +32,6 @@ #define HAVE_AVX2 1 #define HAVE_VPX_PORTS 1 #define HAVE_STDINT_H 1 -#define HAVE_ALT_TREE_LAYOUT 0 #define HAVE_PTHREAD_H 1 #define HAVE_SYS_MMAN_H 1 #define HAVE_UNISTD_H 0 diff --git a/chromium/third_party/libvpx/source/config/linux/ia32/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/linux/ia32/vpx_dsp_rtcd.h index 8cc2fa5b737..32ee77e25ce 100644 --- a/chromium/third_party/libvpx/source/config/linux/ia32/vpx_dsp_rtcd.h +++ b/chromium/third_party/libvpx/source/config/linux/ia32/vpx_dsp_rtcd.h @@ -18,6 +18,45 @@ extern "C" { #endif +void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +#define vpx_comp_avg_pred vpx_comp_avg_pred_c + +void vpx_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get16x16var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get16x16var_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +RTCD_EXTERN void (*vpx_get16x16var)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); + +unsigned int vpx_get4x4sse_cs_c(const unsigned 
char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); +#define vpx_get4x4sse_cs vpx_get4x4sse_cs_c + +void vpx_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get8x8var_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get8x8var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +RTCD_EXTERN void (*vpx_get8x8var)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); + +unsigned int vpx_get_mb_ss_c(const int16_t *); +unsigned int vpx_get_mb_ss_mmx(const int16_t *); +unsigned int vpx_get_mb_ss_sse2(const int16_t *); +RTCD_EXTERN unsigned int (*vpx_get_mb_ss)(const int16_t *); + +unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse16x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); + +unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse16x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); + +unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse8x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); + +unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse8x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); + unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad16x16_mmx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -251,6 +290,68 @@ void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_p void vpx_sad8x8x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); RTCD_EXTERN void (*vpx_sad8x8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +unsigned int 
vpx_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x4_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int 
vpx_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance4x4)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance4x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x32_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance64x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance64x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x4)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + void vpx_dsp_rtcd(void); #ifdef RTCD_C @@ -261,6 +362,25 @@ static void setup_rtcd_internal(void) (void)flags; + vpx_get16x16var = vpx_get16x16var_c; + if (flags & 
HAS_SSE2) vpx_get16x16var = vpx_get16x16var_sse2; + if (flags & HAS_AVX2) vpx_get16x16var = vpx_get16x16var_avx2; + vpx_get8x8var = vpx_get8x8var_c; + if (flags & HAS_MMX) vpx_get8x8var = vpx_get8x8var_mmx; + if (flags & HAS_SSE2) vpx_get8x8var = vpx_get8x8var_sse2; + vpx_get_mb_ss = vpx_get_mb_ss_c; + if (flags & HAS_MMX) vpx_get_mb_ss = vpx_get_mb_ss_mmx; + if (flags & HAS_SSE2) vpx_get_mb_ss = vpx_get_mb_ss_sse2; + vpx_mse16x16 = vpx_mse16x16_c; + if (flags & HAS_MMX) vpx_mse16x16 = vpx_mse16x16_mmx; + if (flags & HAS_SSE2) vpx_mse16x16 = vpx_mse16x16_sse2; + if (flags & HAS_AVX2) vpx_mse16x16 = vpx_mse16x16_avx2; + vpx_mse16x8 = vpx_mse16x8_c; + if (flags & HAS_SSE2) vpx_mse16x8 = vpx_mse16x8_sse2; + vpx_mse8x16 = vpx_mse8x16_c; + if (flags & HAS_SSE2) vpx_mse8x16 = vpx_mse8x16_sse2; + vpx_mse8x8 = vpx_mse8x8_c; + if (flags & HAS_SSE2) vpx_mse8x8 = vpx_mse8x8_sse2; vpx_sad16x16 = vpx_sad16x16_c; if (flags & HAS_MMX) vpx_sad16x16 = vpx_sad16x16_mmx; if (flags & HAS_SSE2) vpx_sad16x16 = vpx_sad16x16_sse2; @@ -378,6 +498,42 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSE2) vpx_sad8x8x4d = vpx_sad8x8x4d_sse2; vpx_sad8x8x8 = vpx_sad8x8x8_c; if (flags & HAS_SSE4_1) vpx_sad8x8x8 = vpx_sad8x8x8_sse4_1; + vpx_variance16x16 = vpx_variance16x16_c; + if (flags & HAS_MMX) vpx_variance16x16 = vpx_variance16x16_mmx; + if (flags & HAS_SSE2) vpx_variance16x16 = vpx_variance16x16_sse2; + if (flags & HAS_AVX2) vpx_variance16x16 = vpx_variance16x16_avx2; + vpx_variance16x32 = vpx_variance16x32_c; + if (flags & HAS_SSE2) vpx_variance16x32 = vpx_variance16x32_sse2; + vpx_variance16x8 = vpx_variance16x8_c; + if (flags & HAS_MMX) vpx_variance16x8 = vpx_variance16x8_mmx; + if (flags & HAS_SSE2) vpx_variance16x8 = vpx_variance16x8_sse2; + vpx_variance32x16 = vpx_variance32x16_c; + if (flags & HAS_SSE2) vpx_variance32x16 = vpx_variance32x16_sse2; + if (flags & HAS_AVX2) vpx_variance32x16 = vpx_variance32x16_avx2; + vpx_variance32x32 = vpx_variance32x32_c; + if (flags & HAS_SSE2) vpx_variance32x32 = vpx_variance32x32_sse2; + if (flags & HAS_AVX2) vpx_variance32x32 = vpx_variance32x32_avx2; + vpx_variance32x64 = vpx_variance32x64_c; + if (flags & HAS_SSE2) vpx_variance32x64 = vpx_variance32x64_sse2; + vpx_variance4x4 = vpx_variance4x4_c; + if (flags & HAS_MMX) vpx_variance4x4 = vpx_variance4x4_mmx; + if (flags & HAS_SSE2) vpx_variance4x4 = vpx_variance4x4_sse2; + vpx_variance4x8 = vpx_variance4x8_c; + if (flags & HAS_SSE2) vpx_variance4x8 = vpx_variance4x8_sse2; + vpx_variance64x32 = vpx_variance64x32_c; + if (flags & HAS_SSE2) vpx_variance64x32 = vpx_variance64x32_sse2; + if (flags & HAS_AVX2) vpx_variance64x32 = vpx_variance64x32_avx2; + vpx_variance64x64 = vpx_variance64x64_c; + if (flags & HAS_SSE2) vpx_variance64x64 = vpx_variance64x64_sse2; + if (flags & HAS_AVX2) vpx_variance64x64 = vpx_variance64x64_avx2; + vpx_variance8x16 = vpx_variance8x16_c; + if (flags & HAS_MMX) vpx_variance8x16 = vpx_variance8x16_mmx; + if (flags & HAS_SSE2) vpx_variance8x16 = vpx_variance8x16_sse2; + vpx_variance8x4 = vpx_variance8x4_c; + if (flags & HAS_SSE2) vpx_variance8x4 = vpx_variance8x4_sse2; + vpx_variance8x8 = vpx_variance8x8_c; + if (flags & HAS_MMX) vpx_variance8x8 = vpx_variance8x8_mmx; + if (flags & HAS_SSE2) vpx_variance8x8 = vpx_variance8x8_sse2; } #endif diff --git a/chromium/third_party/libvpx/source/config/linux/mips64el/vp8_rtcd.h b/chromium/third_party/libvpx/source/config/linux/mips64el/vp8_rtcd.h index ec819184a76..3befb52db1e 100644 --- 
a/chromium/third_party/libvpx/source/config/linux/mips64el/vp8_rtcd.h +++ b/chromium/third_party/libvpx/source/config/linux/mips64el/vp8_rtcd.h @@ -107,12 +107,6 @@ void vp8_filter_by_weight8x8_c(unsigned char *src, int src_stride, unsigned char int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); #define vp8_full_search_sad vp8_full_search_sad_c -unsigned int vp8_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); -#define vp8_get4x4sse_cs vp8_get4x4sse_cs_c - -unsigned int vp8_get_mb_ss_c(const short *); -#define vp8_get_mb_ss vp8_get_mb_ss_c - void vp8_intra4x4_predict_c(unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left); #define vp8_intra4x4_predict vp8_intra4x4_predict_c @@ -152,9 +146,6 @@ void vp8_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,in int vp8_mbuverror_c(struct macroblock *mb); #define vp8_mbuverror vp8_mbuverror_c -unsigned int vp8_mse16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_mse16x16 vp8_mse16x16_c - void vp8_plane_add_noise_c(unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch); #define vp8_plane_add_noise vp8_plane_add_noise_c @@ -197,9 +188,6 @@ void vp8_sixtap_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int y void vp8_sixtap_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); #define vp8_sixtap_predict8x8 vp8_sixtap_predict8x8_c -unsigned int vp8_sub_pixel_mse16x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); -#define vp8_sub_pixel_mse16x16 vp8_sub_pixel_mse16x16_c - unsigned int vp8_sub_pixel_variance16x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); #define vp8_sub_pixel_variance16x16 vp8_sub_pixel_variance16x16_c @@ -227,21 +215,6 @@ void vp8_subtract_mby_c(short *diff, unsigned char *src, int src_stride, unsigne void vp8_temporal_filter_apply_c(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count); #define vp8_temporal_filter_apply vp8_temporal_filter_apply_c -unsigned int vp8_variance16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_variance16x16 vp8_variance16x16_c - -unsigned int vp8_variance16x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_variance16x8 vp8_variance16x8_c - -unsigned int vp8_variance4x4_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_variance4x4 vp8_variance4x4_c - -unsigned int vp8_variance8x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_variance8x16 vp8_variance8x16_c - -unsigned int vp8_variance8x8_c(const unsigned char *src_ptr, int 
source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_variance8x8 vp8_variance8x8_c - unsigned int vp8_variance_halfpixvar16x16_h_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); #define vp8_variance_halfpixvar16x16_h vp8_variance_halfpixvar16x16_h_c diff --git a/chromium/third_party/libvpx/source/config/linux/mips64el/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/linux/mips64el/vp9_rtcd.h index 84bd45f2876..8cf86073577 100644 --- a/chromium/third_party/libvpx/source/config/linux/mips64el/vp9_rtcd.h +++ b/chromium/third_party/libvpx/source/config/linux/mips64el/vp9_rtcd.h @@ -245,15 +245,6 @@ int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, i void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); #define vp9_fwht4x4 vp9_fwht4x4_c -void vp9_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -#define vp9_get16x16var vp9_get16x16var_c - -void vp9_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -#define vp9_get8x8var vp9_get8x8var_c - -unsigned int vp9_get_mb_ss_c(const int16_t *); -#define vp9_get_mb_ss vp9_get_mb_ss_c - void vp9_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_h_predictor_16x16 vp9_h_predictor_16x16_c @@ -368,18 +359,6 @@ void vp9_mbpost_proc_down_c(uint8_t *dst, int pitch, int rows, int cols, int fli void vp9_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); #define vp9_minmax_8x8 vp9_minmax_8x8_c -unsigned int vp9_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -#define vp9_mse16x16 vp9_mse16x16_c - -unsigned int vp9_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -#define vp9_mse16x8 vp9_mse16x8_c - -unsigned int vp9_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -#define vp9_mse8x16 vp9_mse8x16_c - -unsigned int vp9_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -#define vp9_mse8x8 vp9_mse8x8_c - void vp9_plane_add_noise_c(uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch); #define vp9_plane_add_noise vp9_plane_add_noise_c @@ -509,45 +488,6 @@ void vp9_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov void vp9_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_v_predictor_8x8 vp9_v_predictor_8x8_c -unsigned int vp9_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance16x16 vp9_variance16x16_c - -unsigned int vp9_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance16x32 vp9_variance16x32_c - -unsigned int vp9_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance16x8 vp9_variance16x8_c - -unsigned int vp9_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t 
*ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance32x16 vp9_variance32x16_c - -unsigned int vp9_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance32x32 vp9_variance32x32_c - -unsigned int vp9_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance32x64 vp9_variance32x64_c - -unsigned int vp9_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance4x4 vp9_variance4x4_c - -unsigned int vp9_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance4x8 vp9_variance4x8_c - -unsigned int vp9_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance64x32 vp9_variance64x32_c - -unsigned int vp9_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance64x64 vp9_variance64x64_c - -unsigned int vp9_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance8x16 vp9_variance8x16_c - -unsigned int vp9_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance8x4 vp9_variance8x4_c - -unsigned int vp9_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance8x8 vp9_variance8x8_c - int vp9_vector_var_c(int16_t const *ref, int16_t const *src, const int bwl); #define vp9_vector_var vp9_vector_var_c diff --git a/chromium/third_party/libvpx/source/config/linux/mips64el/vpx_config.c b/chromium/third_party/libvpx/source/config/linux/mips64el/vpx_config.c index 1000fbb39b3..9e6876b5859 100644 --- a/chromium/third_party/libvpx/source/config/linux/mips64el/vpx_config.c +++ b/chromium/third_party/libvpx/source/config/linux/mips64el/vpx_config.c @@ -5,5 +5,6 @@ /* tree. An additional intellectual property rights grant can be found */ /* in the file PATENTS. All contributing project authors may */ /* be found in the AUTHORS file in the root of the source tree. 
*/ +#include "vpx/vpx_codec.h" static const char* const cfg = "--target=mips64-linux-gcc --enable-external-build --enable-postproc --disable-install-srcs --enable-multi-res-encoding --enable-temporal-denoising --disable-unit-tests --disable-install-docs --disable-examples --enable-vp9-temporal-denoising --enable-vp9-postproc --size-limit=16384x16384"; const char *vpx_codec_build_config(void) {return cfg;} diff --git a/chromium/third_party/libvpx/source/config/linux/mips64el/vpx_config.h b/chromium/third_party/libvpx/source/config/linux/mips64el/vpx_config.h index 194dfa5d91d..a9e7f3ac8d9 100644 --- a/chromium/third_party/libvpx/source/config/linux/mips64el/vpx_config.h +++ b/chromium/third_party/libvpx/source/config/linux/mips64el/vpx_config.h @@ -32,7 +32,6 @@ #define HAVE_AVX2 0 #define HAVE_VPX_PORTS 1 #define HAVE_STDINT_H 1 -#define HAVE_ALT_TREE_LAYOUT 0 #define HAVE_PTHREAD_H 1 #define HAVE_SYS_MMAN_H 1 #define HAVE_UNISTD_H 0 diff --git a/chromium/third_party/libvpx/source/config/linux/mips64el/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/linux/mips64el/vpx_dsp_rtcd.h index 86c5b4425dc..f086946da23 100644 --- a/chromium/third_party/libvpx/source/config/linux/mips64el/vpx_dsp_rtcd.h +++ b/chromium/third_party/libvpx/source/config/linux/mips64el/vpx_dsp_rtcd.h @@ -18,6 +18,33 @@ extern "C" { #endif +void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +#define vpx_comp_avg_pred vpx_comp_avg_pred_c + +void vpx_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_get16x16var vpx_get16x16var_c + +unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); +#define vpx_get4x4sse_cs vpx_get4x4sse_cs_c + +void vpx_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_get8x8var vpx_get8x8var_c + +unsigned int vpx_get_mb_ss_c(const int16_t *); +#define vpx_get_mb_ss vpx_get_mb_ss_c + +unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_mse16x16 vpx_mse16x16_c + +unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_mse16x8 vpx_mse16x8_c + +unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_mse8x16 vpx_mse8x16_c + +unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_mse8x8 vpx_mse8x8_c + unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); #define vpx_sad16x16 vpx_sad16x16_c @@ -183,6 +210,45 @@ void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * con void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); #define vpx_sad8x8x8 vpx_sad8x8x8_c +unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance16x16 vpx_variance16x16_c + +unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); 
+#define vpx_variance16x32 vpx_variance16x32_c + +unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance16x8 vpx_variance16x8_c + +unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance32x16 vpx_variance32x16_c + +unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance32x32 vpx_variance32x32_c + +unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance32x64 vpx_variance32x64_c + +unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance4x4 vpx_variance4x4_c + +unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance4x8 vpx_variance4x8_c + +unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance64x32 vpx_variance64x32_c + +unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance64x64 vpx_variance64x64_c + +unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance8x16 vpx_variance8x16_c + +unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance8x4 vpx_variance8x4_c + +unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance8x8 vpx_variance8x8_c + void vpx_dsp_rtcd(void); #include "vpx_config.h" diff --git a/chromium/third_party/libvpx/source/config/linux/mipsel/vp8_rtcd.h b/chromium/third_party/libvpx/source/config/linux/mipsel/vp8_rtcd.h index ec819184a76..3befb52db1e 100644 --- a/chromium/third_party/libvpx/source/config/linux/mipsel/vp8_rtcd.h +++ b/chromium/third_party/libvpx/source/config/linux/mipsel/vp8_rtcd.h @@ -107,12 +107,6 @@ void vp8_filter_by_weight8x8_c(unsigned char *src, int src_stride, unsigned char int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); #define vp8_full_search_sad vp8_full_search_sad_c -unsigned int vp8_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); -#define vp8_get4x4sse_cs vp8_get4x4sse_cs_c - -unsigned int vp8_get_mb_ss_c(const short *); -#define vp8_get_mb_ss vp8_get_mb_ss_c - void vp8_intra4x4_predict_c(unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left); #define vp8_intra4x4_predict vp8_intra4x4_predict_c @@ -152,9 +146,6 @@ void vp8_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,in int vp8_mbuverror_c(struct macroblock *mb); #define vp8_mbuverror vp8_mbuverror_c -unsigned int vp8_mse16x16_c(const unsigned char *src_ptr, int source_stride, const 
unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_mse16x16 vp8_mse16x16_c - void vp8_plane_add_noise_c(unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch); #define vp8_plane_add_noise vp8_plane_add_noise_c @@ -197,9 +188,6 @@ void vp8_sixtap_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int y void vp8_sixtap_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); #define vp8_sixtap_predict8x8 vp8_sixtap_predict8x8_c -unsigned int vp8_sub_pixel_mse16x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); -#define vp8_sub_pixel_mse16x16 vp8_sub_pixel_mse16x16_c - unsigned int vp8_sub_pixel_variance16x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); #define vp8_sub_pixel_variance16x16 vp8_sub_pixel_variance16x16_c @@ -227,21 +215,6 @@ void vp8_subtract_mby_c(short *diff, unsigned char *src, int src_stride, unsigne void vp8_temporal_filter_apply_c(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count); #define vp8_temporal_filter_apply vp8_temporal_filter_apply_c -unsigned int vp8_variance16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_variance16x16 vp8_variance16x16_c - -unsigned int vp8_variance16x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_variance16x8 vp8_variance16x8_c - -unsigned int vp8_variance4x4_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_variance4x4 vp8_variance4x4_c - -unsigned int vp8_variance8x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_variance8x16 vp8_variance8x16_c - -unsigned int vp8_variance8x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_variance8x8 vp8_variance8x8_c - unsigned int vp8_variance_halfpixvar16x16_h_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); #define vp8_variance_halfpixvar16x16_h vp8_variance_halfpixvar16x16_h_c diff --git a/chromium/third_party/libvpx/source/config/linux/mipsel/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/linux/mipsel/vp9_rtcd.h index 84bd45f2876..8cf86073577 100644 --- a/chromium/third_party/libvpx/source/config/linux/mipsel/vp9_rtcd.h +++ b/chromium/third_party/libvpx/source/config/linux/mipsel/vp9_rtcd.h @@ -245,15 +245,6 @@ int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, i void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); #define vp9_fwht4x4 vp9_fwht4x4_c -void vp9_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -#define vp9_get16x16var vp9_get16x16var_c - -void vp9_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -#define vp9_get8x8var vp9_get8x8var_c - 
-unsigned int vp9_get_mb_ss_c(const int16_t *); -#define vp9_get_mb_ss vp9_get_mb_ss_c - void vp9_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_h_predictor_16x16 vp9_h_predictor_16x16_c @@ -368,18 +359,6 @@ void vp9_mbpost_proc_down_c(uint8_t *dst, int pitch, int rows, int cols, int fli void vp9_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); #define vp9_minmax_8x8 vp9_minmax_8x8_c -unsigned int vp9_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -#define vp9_mse16x16 vp9_mse16x16_c - -unsigned int vp9_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -#define vp9_mse16x8 vp9_mse16x8_c - -unsigned int vp9_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -#define vp9_mse8x16 vp9_mse8x16_c - -unsigned int vp9_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -#define vp9_mse8x8 vp9_mse8x8_c - void vp9_plane_add_noise_c(uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch); #define vp9_plane_add_noise vp9_plane_add_noise_c @@ -509,45 +488,6 @@ void vp9_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov void vp9_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_v_predictor_8x8 vp9_v_predictor_8x8_c -unsigned int vp9_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance16x16 vp9_variance16x16_c - -unsigned int vp9_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance16x32 vp9_variance16x32_c - -unsigned int vp9_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance16x8 vp9_variance16x8_c - -unsigned int vp9_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance32x16 vp9_variance32x16_c - -unsigned int vp9_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance32x32 vp9_variance32x32_c - -unsigned int vp9_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance32x64 vp9_variance32x64_c - -unsigned int vp9_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance4x4 vp9_variance4x4_c - -unsigned int vp9_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance4x8 vp9_variance4x8_c - -unsigned int vp9_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance64x32 vp9_variance64x32_c - -unsigned int vp9_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance64x64 vp9_variance64x64_c - -unsigned int vp9_variance8x16_c(const uint8_t *src_ptr, int 
source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance8x16 vp9_variance8x16_c - -unsigned int vp9_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance8x4 vp9_variance8x4_c - -unsigned int vp9_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance8x8 vp9_variance8x8_c - int vp9_vector_var_c(int16_t const *ref, int16_t const *src, const int bwl); #define vp9_vector_var vp9_vector_var_c diff --git a/chromium/third_party/libvpx/source/config/linux/mipsel/vpx_config.c b/chromium/third_party/libvpx/source/config/linux/mipsel/vpx_config.c index 7f5c1f6e858..2c1375d883a 100644 --- a/chromium/third_party/libvpx/source/config/linux/mipsel/vpx_config.c +++ b/chromium/third_party/libvpx/source/config/linux/mipsel/vpx_config.c @@ -5,5 +5,6 @@ /* tree. An additional intellectual property rights grant can be found */ /* in the file PATENTS. All contributing project authors may */ /* be found in the AUTHORS file in the root of the source tree. */ +#include "vpx/vpx_codec.h" static const char* const cfg = "--target=mips32-linux-gcc --enable-external-build --enable-postproc --disable-install-srcs --enable-multi-res-encoding --enable-temporal-denoising --disable-unit-tests --disable-install-docs --disable-examples --enable-vp9-temporal-denoising --enable-vp9-postproc --size-limit=16384x16384"; const char *vpx_codec_build_config(void) {return cfg;} diff --git a/chromium/third_party/libvpx/source/config/linux/mipsel/vpx_config.h b/chromium/third_party/libvpx/source/config/linux/mipsel/vpx_config.h index 1783d40cca5..4210a7c868d 100644 --- a/chromium/third_party/libvpx/source/config/linux/mipsel/vpx_config.h +++ b/chromium/third_party/libvpx/source/config/linux/mipsel/vpx_config.h @@ -32,7 +32,6 @@ #define HAVE_AVX2 0 #define HAVE_VPX_PORTS 1 #define HAVE_STDINT_H 1 -#define HAVE_ALT_TREE_LAYOUT 0 #define HAVE_PTHREAD_H 1 #define HAVE_SYS_MMAN_H 1 #define HAVE_UNISTD_H 0 diff --git a/chromium/third_party/libvpx/source/config/linux/mipsel/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/linux/mipsel/vpx_dsp_rtcd.h index 86c5b4425dc..f086946da23 100644 --- a/chromium/third_party/libvpx/source/config/linux/mipsel/vpx_dsp_rtcd.h +++ b/chromium/third_party/libvpx/source/config/linux/mipsel/vpx_dsp_rtcd.h @@ -18,6 +18,33 @@ extern "C" { #endif +void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +#define vpx_comp_avg_pred vpx_comp_avg_pred_c + +void vpx_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_get16x16var vpx_get16x16var_c + +unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); +#define vpx_get4x4sse_cs vpx_get4x4sse_cs_c + +void vpx_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_get8x8var vpx_get8x8var_c + +unsigned int vpx_get_mb_ss_c(const int16_t *); +#define vpx_get_mb_ss vpx_get_mb_ss_c + +unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_mse16x16 vpx_mse16x16_c + +unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int 
recon_stride, unsigned int *sse); +#define vpx_mse16x8 vpx_mse16x8_c + +unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_mse8x16 vpx_mse8x16_c + +unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_mse8x8 vpx_mse8x8_c + unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); #define vpx_sad16x16 vpx_sad16x16_c @@ -183,6 +210,45 @@ void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * con void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); #define vpx_sad8x8x8 vpx_sad8x8x8_c +unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance16x16 vpx_variance16x16_c + +unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance16x32 vpx_variance16x32_c + +unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance16x8 vpx_variance16x8_c + +unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance32x16 vpx_variance32x16_c + +unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance32x32 vpx_variance32x32_c + +unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance32x64 vpx_variance32x64_c + +unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance4x4 vpx_variance4x4_c + +unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance4x8 vpx_variance4x8_c + +unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance64x32 vpx_variance64x32_c + +unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance64x64 vpx_variance64x64_c + +unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance8x16 vpx_variance8x16_c + +unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance8x4 vpx_variance8x4_c + +unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance8x8 vpx_variance8x8_c + void vpx_dsp_rtcd(void); #include "vpx_config.h" diff --git a/chromium/third_party/libvpx/source/config/linux/x64/vp8_rtcd.h b/chromium/third_party/libvpx/source/config/linux/x64/vp8_rtcd.h index 9989eb09995..5285ac71055 100644 --- a/chromium/third_party/libvpx/source/config/linux/x64/vp8_rtcd.h +++ 
b/chromium/third_party/libvpx/source/config/linux/x64/vp8_rtcd.h @@ -74,10 +74,10 @@ void vp8_clear_system_state_c(); void vpx_reset_mmx_state(); #define vp8_clear_system_state vpx_reset_mmx_state -void vp8_copy32xn_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n); -void vp8_copy32xn_sse2(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n); -void vp8_copy32xn_sse3(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n); -RTCD_EXTERN void (*vp8_copy32xn)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n); +void vp8_copy32xn_c(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n); +void vp8_copy32xn_sse2(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n); +void vp8_copy32xn_sse3(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n); +RTCD_EXTERN void (*vp8_copy32xn)(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n); void vp8_copy_mem16x16_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); void vp8_copy_mem16x16_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); @@ -147,15 +147,6 @@ int vp8_full_search_sadx3(struct macroblock *x, struct block *b, struct blockd * int vp8_full_search_sadx8(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); RTCD_EXTERN int (*vp8_full_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -unsigned int vp8_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); -unsigned int vp8_get4x4sse_cs_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); -#define vp8_get4x4sse_cs vp8_get4x4sse_cs_mmx - -unsigned int vp8_get_mb_ss_c(const short *); -unsigned int vp8_get_mb_ss_mmx(const short *); -unsigned int vp8_get_mb_ss_sse2(const short *); -#define vp8_get_mb_ss vp8_get_mb_ss_sse2 - void vp8_intra4x4_predict_c(unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left); #define vp8_intra4x4_predict vp8_intra4x4_predict_c @@ -218,11 +209,6 @@ int vp8_mbuverror_mmx(struct macroblock *mb); int vp8_mbuverror_xmm(struct macroblock *mb); #define vp8_mbuverror vp8_mbuverror_xmm -unsigned int vp8_mse16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_mse16x16_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_mse16x16_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_mse16x16 vp8_mse16x16_wmt - void vp8_plane_add_noise_c(unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch); void vp8_plane_add_noise_mmx(unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char 
bothclamp[16], unsigned int w, unsigned int h, int pitch); void vp8_plane_add_noise_wmt(unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch); @@ -290,11 +276,6 @@ void vp8_sixtap_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, in void vp8_sixtap_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); RTCD_EXTERN void (*vp8_sixtap_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -unsigned int vp8_sub_pixel_mse16x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); -unsigned int vp8_sub_pixel_mse16x16_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); -unsigned int vp8_sub_pixel_mse16x16_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); -#define vp8_sub_pixel_mse16x16 vp8_sub_pixel_mse16x16_wmt - unsigned int vp8_sub_pixel_variance16x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); unsigned int vp8_sub_pixel_variance16x16_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); unsigned int vp8_sub_pixel_variance16x16_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); @@ -337,31 +318,6 @@ void vp8_subtract_mby_mmx(short *diff, unsigned char *src, int src_stride, unsig void vp8_subtract_mby_sse2(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride); #define vp8_subtract_mby vp8_subtract_mby_sse2 -unsigned int vp8_variance16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance16x16_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance16x16_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_variance16x16 vp8_variance16x16_wmt - -unsigned int vp8_variance16x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance16x8_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance16x8_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_variance16x8 vp8_variance16x8_wmt - -unsigned int vp8_variance4x4_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance4x4_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance4x4_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_variance4x4 vp8_variance4x4_wmt - -unsigned int vp8_variance8x16_c(const unsigned char 
*src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance8x16_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance8x16_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_variance8x16 vp8_variance8x16_wmt - -unsigned int vp8_variance8x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance8x8_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance8x8_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_variance8x8 vp8_variance8x8_wmt - unsigned int vp8_variance_halfpixvar16x16_h_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); unsigned int vp8_variance_halfpixvar16x16_h_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); unsigned int vp8_variance_halfpixvar16x16_h_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); diff --git a/chromium/third_party/libvpx/source/config/linux/x64/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/linux/x64/vp9_rtcd.h index 10a6b8401db..0834e762f34 100644 --- a/chromium/third_party/libvpx/source/config/linux/x64/vp9_rtcd.h +++ b/chromium/third_party/libvpx/source/config/linux/x64/vp9_rtcd.h @@ -116,7 +116,8 @@ void vp9_d153_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint RTCD_EXTERN void (*vp9_d153_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vp9_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d153_predictor_32x32 vp9_d153_predictor_32x32_c +void vp9_d153_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_d153_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vp9_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vp9_d153_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); @@ -321,19 +322,6 @@ void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); void vp9_fwht4x4_mmx(const int16_t *input, tran_low_t *output, int stride); #define vp9_fwht4x4 vp9_fwht4x4_mmx -void vp9_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -void vp9_get16x16var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -void vp9_get16x16var_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -RTCD_EXTERN void (*vp9_get16x16var)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); - -void vp9_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -void vp9_get8x8var_sse2(const uint8_t *src_ptr, 
int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -#define vp9_get8x8var vp9_get8x8var_sse2 - -unsigned int vp9_get_mb_ss_c(const int16_t *); -unsigned int vp9_get_mb_ss_sse2(const int16_t *); -#define vp9_get_mb_ss vp9_get_mb_ss_sse2 - void vp9_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vp9_h_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); RTCD_EXTERN void (*vp9_h_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); @@ -488,23 +476,6 @@ void vp9_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *mi void vp9_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); #define vp9_minmax_8x8 vp9_minmax_8x8_sse2 -unsigned int vp9_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vp9_mse16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vp9_mse16x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_mse16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); - -unsigned int vp9_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vp9_mse16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -#define vp9_mse16x8 vp9_mse16x8_sse2 - -unsigned int vp9_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vp9_mse8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -#define vp9_mse8x16 vp9_mse8x16_sse2 - -unsigned int vp9_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vp9_mse8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -#define vp9_mse8x8 vp9_mse8x8_sse2 - void vp9_plane_add_noise_c(uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch); void vp9_plane_add_noise_wmt(uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch); #define vp9_plane_add_noise vp9_plane_add_noise_wmt @@ -709,63 +680,6 @@ void vp9_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov void vp9_v_predictor_8x8_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_v_predictor_8x8 vp9_v_predictor_8x8_sse -unsigned int vp9_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance16x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int 
*sse); - -unsigned int vp9_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance16x32 vp9_variance16x32_sse2 - -unsigned int vp9_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance16x8 vp9_variance16x8_sse2 - -unsigned int vp9_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance32x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance32x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance32x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance32x64 vp9_variance32x64_sse2 - -unsigned int vp9_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance4x4 vp9_variance4x4_sse2 - -unsigned int vp9_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance4x8 vp9_variance4x8_sse2 - -unsigned int vp9_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance64x32_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance64x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance64x64_sse2(const uint8_t *src_ptr, int 
source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance64x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance8x16 vp9_variance8x16_sse2 - -unsigned int vp9_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance8x4 vp9_variance8x4_sse2 - -unsigned int vp9_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance8x8 vp9_variance8x8_sse2 - int vp9_vector_var_c(int16_t const *ref, int16_t const *src, const int bwl); int vp9_vector_var_sse2(int16_t const *ref, int16_t const *src, const int bwl); #define vp9_vector_var vp9_vector_var_sse2 @@ -799,6 +713,8 @@ static void setup_rtcd_internal(void) if (flags & HAS_AVX2) vp9_convolve8_vert = vp9_convolve8_vert_avx2; vp9_d153_predictor_16x16 = vp9_d153_predictor_16x16_c; if (flags & HAS_SSSE3) vp9_d153_predictor_16x16 = vp9_d153_predictor_16x16_ssse3; + vp9_d153_predictor_32x32 = vp9_d153_predictor_32x32_c; + if (flags & HAS_SSSE3) vp9_d153_predictor_32x32 = vp9_d153_predictor_32x32_ssse3; vp9_d153_predictor_4x4 = vp9_d153_predictor_4x4_c; if (flags & HAS_SSSE3) vp9_d153_predictor_4x4 = vp9_d153_predictor_4x4_ssse3; vp9_d153_predictor_8x8 = vp9_d153_predictor_8x8_c; @@ -838,8 +754,6 @@ static void setup_rtcd_internal(void) vp9_full_search_sad = vp9_full_search_sad_c; if (flags & HAS_SSE3) vp9_full_search_sad = vp9_full_search_sadx3; if (flags & HAS_SSE4_1) vp9_full_search_sad = vp9_full_search_sadx8; - vp9_get16x16var = vp9_get16x16var_sse2; - if (flags & HAS_AVX2) vp9_get16x16var = vp9_get16x16var_avx2; vp9_h_predictor_16x16 = vp9_h_predictor_16x16_c; if (flags & HAS_SSSE3) vp9_h_predictor_16x16 = vp9_h_predictor_16x16_ssse3; vp9_h_predictor_32x32 = vp9_h_predictor_32x32_c; @@ -856,8 +770,6 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSSE3) vp9_idct8x8_64_add = vp9_idct8x8_64_add_ssse3; vp9_lpf_horizontal_16 = vp9_lpf_horizontal_16_sse2; if (flags & HAS_AVX2) vp9_lpf_horizontal_16 = vp9_lpf_horizontal_16_avx2; - vp9_mse16x16 = vp9_mse16x16_sse2; - if (flags & HAS_AVX2) vp9_mse16x16 = vp9_mse16x16_avx2; vp9_quantize_b = vp9_quantize_b_sse2; if (flags & HAS_SSSE3) vp9_quantize_b = vp9_quantize_b_ssse3; vp9_quantize_b_32x32 = vp9_quantize_b_32x32_c; @@ -922,16 +834,6 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSSE3) vp9_sub_pixel_variance8x4 = vp9_sub_pixel_variance8x4_ssse3; vp9_sub_pixel_variance8x8 = vp9_sub_pixel_variance8x8_sse2; if (flags & HAS_SSSE3) vp9_sub_pixel_variance8x8 = vp9_sub_pixel_variance8x8_ssse3; - vp9_variance16x16 = vp9_variance16x16_sse2; - if (flags & HAS_AVX2) vp9_variance16x16 = vp9_variance16x16_avx2; - vp9_variance32x16 = 
vp9_variance32x16_sse2; - if (flags & HAS_AVX2) vp9_variance32x16 = vp9_variance32x16_avx2; - vp9_variance32x32 = vp9_variance32x32_sse2; - if (flags & HAS_AVX2) vp9_variance32x32 = vp9_variance32x32_avx2; - vp9_variance64x32 = vp9_variance64x32_sse2; - if (flags & HAS_AVX2) vp9_variance64x32 = vp9_variance64x32_avx2; - vp9_variance64x64 = vp9_variance64x64_sse2; - if (flags & HAS_AVX2) vp9_variance64x64 = vp9_variance64x64_avx2; } #endif diff --git a/chromium/third_party/libvpx/source/config/linux/x64/vpx_config.asm b/chromium/third_party/libvpx/source/config/linux/x64/vpx_config.asm index 68a0d73808d..0c4b550b78a 100644 --- a/chromium/third_party/libvpx/source/config/linux/x64/vpx_config.asm +++ b/chromium/third_party/libvpx/source/config/linux/x64/vpx_config.asm @@ -20,7 +20,6 @@ %define HAVE_AVX2 1 %define HAVE_VPX_PORTS 1 %define HAVE_STDINT_H 1 -%define HAVE_ALT_TREE_LAYOUT 0 %define HAVE_PTHREAD_H 1 %define HAVE_SYS_MMAN_H 1 %define HAVE_UNISTD_H 0 diff --git a/chromium/third_party/libvpx/source/config/linux/x64/vpx_config.c b/chromium/third_party/libvpx/source/config/linux/x64/vpx_config.c index 1221f4849ea..4f190b1958d 100644 --- a/chromium/third_party/libvpx/source/config/linux/x64/vpx_config.c +++ b/chromium/third_party/libvpx/source/config/linux/x64/vpx_config.c @@ -5,5 +5,6 @@ /* tree. An additional intellectual property rights grant can be found */ /* in the file PATENTS. All contributing project authors may */ /* be found in the AUTHORS file in the root of the source tree. */ +#include "vpx/vpx_codec.h" static const char* const cfg = "--target=x86_64-linux-gcc --disable-ccache --enable-pic --enable-realtime-only --enable-external-build --enable-postproc --disable-install-srcs --enable-multi-res-encoding --enable-temporal-denoising --disable-unit-tests --disable-install-docs --disable-examples --enable-vp9-temporal-denoising --enable-vp9-postproc --size-limit=16384x16384"; const char *vpx_codec_build_config(void) {return cfg;} diff --git a/chromium/third_party/libvpx/source/config/linux/x64/vpx_config.h b/chromium/third_party/libvpx/source/config/linux/x64/vpx_config.h index 72f336395ce..27d91efdbd4 100644 --- a/chromium/third_party/libvpx/source/config/linux/x64/vpx_config.h +++ b/chromium/third_party/libvpx/source/config/linux/x64/vpx_config.h @@ -32,7 +32,6 @@ #define HAVE_AVX2 1 #define HAVE_VPX_PORTS 1 #define HAVE_STDINT_H 1 -#define HAVE_ALT_TREE_LAYOUT 0 #define HAVE_PTHREAD_H 1 #define HAVE_SYS_MMAN_H 1 #define HAVE_UNISTD_H 0 diff --git a/chromium/third_party/libvpx/source/config/linux/x64/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/linux/x64/vpx_dsp_rtcd.h index b5df5e08c57..d93c56eb765 100644 --- a/chromium/third_party/libvpx/source/config/linux/x64/vpx_dsp_rtcd.h +++ b/chromium/third_party/libvpx/source/config/linux/x64/vpx_dsp_rtcd.h @@ -18,6 +18,45 @@ extern "C" { #endif +void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +#define vpx_comp_avg_pred vpx_comp_avg_pred_c + +void vpx_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get16x16var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get16x16var_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +RTCD_EXTERN void (*vpx_get16x16var)(const uint8_t *src_ptr, int source_stride, const uint8_t 
*ref_ptr, int ref_stride, unsigned int *sse, int *sum); + +unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); +#define vpx_get4x4sse_cs vpx_get4x4sse_cs_c + +void vpx_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get8x8var_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get8x8var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_get8x8var vpx_get8x8var_sse2 + +unsigned int vpx_get_mb_ss_c(const int16_t *); +unsigned int vpx_get_mb_ss_mmx(const int16_t *); +unsigned int vpx_get_mb_ss_sse2(const int16_t *); +#define vpx_get_mb_ss vpx_get_mb_ss_sse2 + +unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse16x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); + +unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_mse16x8 vpx_mse16x8_sse2 + +unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_mse8x16 vpx_mse8x16_sse2 + +unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_mse8x8 vpx_mse8x8_sse2 + unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad16x16_mmx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -251,6 +290,68 @@ void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_p void vpx_sad8x8x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); RTCD_EXTERN void (*vpx_sad8x8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const 
uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance16x32 vpx_variance16x32_sse2 + +unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance16x8 vpx_variance16x8_sse2 + +unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance32x64 vpx_variance32x64_sse2 + +unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x4_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance4x4 vpx_variance4x4_sse2 + +unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance4x8 vpx_variance4x8_sse2 + +unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int 
vpx_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x32_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance64x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance64x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance8x16 vpx_variance8x16_sse2 + +unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance8x4 vpx_variance8x4_sse2 + +unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance8x8 vpx_variance8x8_sse2 + void vpx_dsp_rtcd(void); #ifdef RTCD_C @@ -261,6 +362,10 @@ static void setup_rtcd_internal(void) (void)flags; + vpx_get16x16var = vpx_get16x16var_sse2; + if (flags & HAS_AVX2) vpx_get16x16var = vpx_get16x16var_avx2; + vpx_mse16x16 = vpx_mse16x16_sse2; + if (flags & HAS_AVX2) vpx_mse16x16 = vpx_mse16x16_avx2; vpx_sad16x16x3 = vpx_sad16x16x3_c; if (flags & HAS_SSE3) vpx_sad16x16x3 = vpx_sad16x16x3_sse3; if (flags & HAS_SSSE3) vpx_sad16x16x3 = vpx_sad16x16x3_ssse3; @@ -307,6 +412,16 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSE3) vpx_sad8x8x3 = vpx_sad8x8x3_sse3; vpx_sad8x8x8 = vpx_sad8x8x8_c; if (flags & HAS_SSE4_1) vpx_sad8x8x8 = vpx_sad8x8x8_sse4_1; + vpx_variance16x16 = vpx_variance16x16_sse2; + if (flags & HAS_AVX2) vpx_variance16x16 = vpx_variance16x16_avx2; + vpx_variance32x16 = vpx_variance32x16_sse2; + if (flags & HAS_AVX2) vpx_variance32x16 = vpx_variance32x16_avx2; + vpx_variance32x32 = vpx_variance32x32_sse2; + if (flags & HAS_AVX2) vpx_variance32x32 = vpx_variance32x32_avx2; + vpx_variance64x32 = vpx_variance64x32_sse2; + if (flags & HAS_AVX2) vpx_variance64x32 = vpx_variance64x32_avx2; + vpx_variance64x64 = vpx_variance64x64_sse2; + if (flags & HAS_AVX2) vpx_variance64x64 = vpx_variance64x64_avx2; } #endif diff --git 
a/chromium/third_party/libvpx/source/config/mac/ia32/vp8_rtcd.h b/chromium/third_party/libvpx/source/config/mac/ia32/vp8_rtcd.h index b60e7ac7301..38fe6d360b0 100644 --- a/chromium/third_party/libvpx/source/config/mac/ia32/vp8_rtcd.h +++ b/chromium/third_party/libvpx/source/config/mac/ia32/vp8_rtcd.h @@ -74,10 +74,10 @@ void vp8_clear_system_state_c(); void vpx_reset_mmx_state(); RTCD_EXTERN void (*vp8_clear_system_state)(); -void vp8_copy32xn_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n); -void vp8_copy32xn_sse2(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n); -void vp8_copy32xn_sse3(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n); -RTCD_EXTERN void (*vp8_copy32xn)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n); +void vp8_copy32xn_c(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n); +void vp8_copy32xn_sse2(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n); +void vp8_copy32xn_sse3(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n); +RTCD_EXTERN void (*vp8_copy32xn)(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n); void vp8_copy_mem16x16_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); void vp8_copy_mem16x16_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); @@ -147,15 +147,6 @@ int vp8_full_search_sadx3(struct macroblock *x, struct block *b, struct blockd * int vp8_full_search_sadx8(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); RTCD_EXTERN int (*vp8_full_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -unsigned int vp8_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); -unsigned int vp8_get4x4sse_cs_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); -RTCD_EXTERN unsigned int (*vp8_get4x4sse_cs)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); - -unsigned int vp8_get_mb_ss_c(const short *); -unsigned int vp8_get_mb_ss_mmx(const short *); -unsigned int vp8_get_mb_ss_sse2(const short *); -RTCD_EXTERN unsigned int (*vp8_get_mb_ss)(const short *); - void vp8_intra4x4_predict_c(unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left); #define vp8_intra4x4_predict vp8_intra4x4_predict_c @@ -218,11 +209,6 @@ int vp8_mbuverror_mmx(struct macroblock *mb); int vp8_mbuverror_xmm(struct macroblock *mb); RTCD_EXTERN int (*vp8_mbuverror)(struct macroblock *mb); -unsigned int vp8_mse16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_mse16x16_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_mse16x16_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char 
*ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp8_mse16x16)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); - void vp8_plane_add_noise_c(unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch); void vp8_plane_add_noise_mmx(unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch); void vp8_plane_add_noise_wmt(unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch); @@ -290,11 +276,6 @@ void vp8_sixtap_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, in void vp8_sixtap_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); RTCD_EXTERN void (*vp8_sixtap_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -unsigned int vp8_sub_pixel_mse16x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); -unsigned int vp8_sub_pixel_mse16x16_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); -unsigned int vp8_sub_pixel_mse16x16_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp8_sub_pixel_mse16x16)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); - unsigned int vp8_sub_pixel_variance16x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); unsigned int vp8_sub_pixel_variance16x16_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); unsigned int vp8_sub_pixel_variance16x16_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); @@ -337,31 +318,6 @@ void vp8_subtract_mby_mmx(short *diff, unsigned char *src, int src_stride, unsig void vp8_subtract_mby_sse2(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride); RTCD_EXTERN void (*vp8_subtract_mby)(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride); -unsigned int vp8_variance16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance16x16_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance16x16_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp8_variance16x16)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp8_variance16x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance16x8_mmx(const unsigned char *src_ptr, int source_stride, const 
unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance16x8_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp8_variance16x8)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp8_variance4x4_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance4x4_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance4x4_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp8_variance4x4)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp8_variance8x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance8x16_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance8x16_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp8_variance8x16)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp8_variance8x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance8x8_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance8x8_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp8_variance8x8)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); - unsigned int vp8_variance_halfpixvar16x16_h_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); unsigned int vp8_variance_halfpixvar16x16_h_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); unsigned int vp8_variance_halfpixvar16x16_h_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); @@ -448,11 +404,6 @@ static void setup_rtcd_internal(void) vp8_full_search_sad = vp8_full_search_sad_c; if (flags & HAS_SSE3) vp8_full_search_sad = vp8_full_search_sadx3; if (flags & HAS_SSE4_1) vp8_full_search_sad = vp8_full_search_sadx8; - vp8_get4x4sse_cs = vp8_get4x4sse_cs_c; - if (flags & HAS_MMX) vp8_get4x4sse_cs = vp8_get4x4sse_cs_mmx; - vp8_get_mb_ss = vp8_get_mb_ss_c; - if (flags & HAS_MMX) vp8_get_mb_ss = vp8_get_mb_ss_mmx; - if (flags & HAS_SSE2) vp8_get_mb_ss = vp8_get_mb_ss_sse2; vp8_loop_filter_bh = vp8_loop_filter_bh_c; if (flags & HAS_MMX) vp8_loop_filter_bh = vp8_loop_filter_bh_mmx; if (flags & HAS_SSE2) vp8_loop_filter_bh = vp8_loop_filter_bh_sse2; @@ -488,9 +439,6 @@ static void setup_rtcd_internal(void) vp8_mbuverror = vp8_mbuverror_c; if (flags & HAS_MMX) vp8_mbuverror = vp8_mbuverror_mmx; if (flags & HAS_SSE2) vp8_mbuverror = 
vp8_mbuverror_xmm; - vp8_mse16x16 = vp8_mse16x16_c; - if (flags & HAS_MMX) vp8_mse16x16 = vp8_mse16x16_mmx; - if (flags & HAS_SSE2) vp8_mse16x16 = vp8_mse16x16_wmt; vp8_plane_add_noise = vp8_plane_add_noise_c; if (flags & HAS_MMX) vp8_plane_add_noise = vp8_plane_add_noise_mmx; if (flags & HAS_SSE2) vp8_plane_add_noise = vp8_plane_add_noise_wmt; @@ -529,9 +477,6 @@ static void setup_rtcd_internal(void) if (flags & HAS_MMX) vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_mmx; if (flags & HAS_SSE2) vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_sse2; if (flags & HAS_SSSE3) vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_ssse3; - vp8_sub_pixel_mse16x16 = vp8_sub_pixel_mse16x16_c; - if (flags & HAS_MMX) vp8_sub_pixel_mse16x16 = vp8_sub_pixel_mse16x16_mmx; - if (flags & HAS_SSE2) vp8_sub_pixel_mse16x16 = vp8_sub_pixel_mse16x16_wmt; vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_c; if (flags & HAS_MMX) vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_mmx; if (flags & HAS_SSE2) vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_wmt; @@ -558,21 +503,6 @@ static void setup_rtcd_internal(void) vp8_subtract_mby = vp8_subtract_mby_c; if (flags & HAS_MMX) vp8_subtract_mby = vp8_subtract_mby_mmx; if (flags & HAS_SSE2) vp8_subtract_mby = vp8_subtract_mby_sse2; - vp8_variance16x16 = vp8_variance16x16_c; - if (flags & HAS_MMX) vp8_variance16x16 = vp8_variance16x16_mmx; - if (flags & HAS_SSE2) vp8_variance16x16 = vp8_variance16x16_wmt; - vp8_variance16x8 = vp8_variance16x8_c; - if (flags & HAS_MMX) vp8_variance16x8 = vp8_variance16x8_mmx; - if (flags & HAS_SSE2) vp8_variance16x8 = vp8_variance16x8_wmt; - vp8_variance4x4 = vp8_variance4x4_c; - if (flags & HAS_MMX) vp8_variance4x4 = vp8_variance4x4_mmx; - if (flags & HAS_SSE2) vp8_variance4x4 = vp8_variance4x4_wmt; - vp8_variance8x16 = vp8_variance8x16_c; - if (flags & HAS_MMX) vp8_variance8x16 = vp8_variance8x16_mmx; - if (flags & HAS_SSE2) vp8_variance8x16 = vp8_variance8x16_wmt; - vp8_variance8x8 = vp8_variance8x8_c; - if (flags & HAS_MMX) vp8_variance8x8 = vp8_variance8x8_mmx; - if (flags & HAS_SSE2) vp8_variance8x8 = vp8_variance8x8_wmt; vp8_variance_halfpixvar16x16_h = vp8_variance_halfpixvar16x16_h_c; if (flags & HAS_MMX) vp8_variance_halfpixvar16x16_h = vp8_variance_halfpixvar16x16_h_mmx; if (flags & HAS_SSE2) vp8_variance_halfpixvar16x16_h = vp8_variance_halfpixvar16x16_h_wmt; diff --git a/chromium/third_party/libvpx/source/config/mac/ia32/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/mac/ia32/vp9_rtcd.h index 7425dc281bd..1da20dc1bcc 100644 --- a/chromium/third_party/libvpx/source/config/mac/ia32/vp9_rtcd.h +++ b/chromium/third_party/libvpx/source/config/mac/ia32/vp9_rtcd.h @@ -116,7 +116,8 @@ void vp9_d153_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint RTCD_EXTERN void (*vp9_d153_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vp9_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d153_predictor_32x32 vp9_d153_predictor_32x32_c +void vp9_d153_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_d153_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vp9_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vp9_d153_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); @@ 
-320,19 +321,6 @@ void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); void vp9_fwht4x4_mmx(const int16_t *input, tran_low_t *output, int stride); RTCD_EXTERN void (*vp9_fwht4x4)(const int16_t *input, tran_low_t *output, int stride); -void vp9_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -void vp9_get16x16var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -void vp9_get16x16var_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -RTCD_EXTERN void (*vp9_get16x16var)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); - -void vp9_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -void vp9_get8x8var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -RTCD_EXTERN void (*vp9_get8x8var)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); - -unsigned int vp9_get_mb_ss_c(const int16_t *); -unsigned int vp9_get_mb_ss_sse2(const int16_t *); -RTCD_EXTERN unsigned int (*vp9_get_mb_ss)(const int16_t *); - void vp9_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vp9_h_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); RTCD_EXTERN void (*vp9_h_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); @@ -484,23 +472,6 @@ void vp9_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *mi void vp9_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); RTCD_EXTERN void (*vp9_minmax_8x8)(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); -unsigned int vp9_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vp9_mse16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vp9_mse16x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_mse16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); - -unsigned int vp9_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vp9_mse16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_mse16x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); - -unsigned int vp9_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vp9_mse8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_mse8x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); - -unsigned int vp9_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int 
recon_stride, unsigned int *sse); -unsigned int vp9_mse8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_mse8x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); - void vp9_plane_add_noise_c(uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch); void vp9_plane_add_noise_wmt(uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch); RTCD_EXTERN void (*vp9_plane_add_noise)(uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch); @@ -700,63 +671,6 @@ void vp9_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov void vp9_v_predictor_8x8_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); RTCD_EXTERN void (*vp9_v_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -unsigned int vp9_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance16x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance16x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance16x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance32x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance32x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int 
ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance32x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance32x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance4x4)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance4x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance64x32_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance64x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance64x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance8x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance8x4)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int 
ref_stride, unsigned int *sse); -unsigned int vp9_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance8x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - int vp9_vector_var_c(int16_t const *ref, int16_t const *src, const int bwl); int vp9_vector_var_sse2(int16_t const *ref, int16_t const *src, const int bwl); RTCD_EXTERN int (*vp9_vector_var)(int16_t const *ref, int16_t const *src, const int bwl); @@ -807,6 +721,8 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSE2) vp9_convolve_copy = vp9_convolve_copy_sse2; vp9_d153_predictor_16x16 = vp9_d153_predictor_16x16_c; if (flags & HAS_SSSE3) vp9_d153_predictor_16x16 = vp9_d153_predictor_16x16_ssse3; + vp9_d153_predictor_32x32 = vp9_d153_predictor_32x32_c; + if (flags & HAS_SSSE3) vp9_d153_predictor_32x32 = vp9_d153_predictor_32x32_ssse3; vp9_d153_predictor_4x4 = vp9_d153_predictor_4x4_c; if (flags & HAS_SSSE3) vp9_d153_predictor_4x4 = vp9_d153_predictor_4x4_ssse3; vp9_d153_predictor_8x8 = vp9_d153_predictor_8x8_c; @@ -907,13 +823,6 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSE4_1) vp9_full_search_sad = vp9_full_search_sadx8; vp9_fwht4x4 = vp9_fwht4x4_c; if (flags & HAS_MMX) vp9_fwht4x4 = vp9_fwht4x4_mmx; - vp9_get16x16var = vp9_get16x16var_c; - if (flags & HAS_SSE2) vp9_get16x16var = vp9_get16x16var_sse2; - if (flags & HAS_AVX2) vp9_get16x16var = vp9_get16x16var_avx2; - vp9_get8x8var = vp9_get8x8var_c; - if (flags & HAS_SSE2) vp9_get8x8var = vp9_get8x8var_sse2; - vp9_get_mb_ss = vp9_get_mb_ss_c; - if (flags & HAS_SSE2) vp9_get_mb_ss = vp9_get_mb_ss_sse2; vp9_h_predictor_16x16 = vp9_h_predictor_16x16_c; if (flags & HAS_SSSE3) vp9_h_predictor_16x16 = vp9_h_predictor_16x16_ssse3; vp9_h_predictor_32x32 = vp9_h_predictor_32x32_c; @@ -987,15 +896,6 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSE2) vp9_mbpost_proc_down = vp9_mbpost_proc_down_xmm; vp9_minmax_8x8 = vp9_minmax_8x8_c; if (flags & HAS_SSE2) vp9_minmax_8x8 = vp9_minmax_8x8_sse2; - vp9_mse16x16 = vp9_mse16x16_c; - if (flags & HAS_SSE2) vp9_mse16x16 = vp9_mse16x16_sse2; - if (flags & HAS_AVX2) vp9_mse16x16 = vp9_mse16x16_avx2; - vp9_mse16x8 = vp9_mse16x8_c; - if (flags & HAS_SSE2) vp9_mse16x8 = vp9_mse16x8_sse2; - vp9_mse8x16 = vp9_mse8x16_c; - if (flags & HAS_SSE2) vp9_mse8x16 = vp9_mse8x16_sse2; - vp9_mse8x8 = vp9_mse8x8_c; - if (flags & HAS_SSE2) vp9_mse8x8 = vp9_mse8x8_sse2; vp9_plane_add_noise = vp9_plane_add_noise_c; if (flags & HAS_SSE2) vp9_plane_add_noise = vp9_plane_add_noise_wmt; vp9_post_proc_down_and_across = vp9_post_proc_down_and_across_c; @@ -1106,37 +1006,6 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSE) vp9_v_predictor_4x4 = vp9_v_predictor_4x4_sse; vp9_v_predictor_8x8 = vp9_v_predictor_8x8_c; if (flags & HAS_SSE) vp9_v_predictor_8x8 = vp9_v_predictor_8x8_sse; - vp9_variance16x16 = vp9_variance16x16_c; - if (flags & HAS_SSE2) vp9_variance16x16 = vp9_variance16x16_sse2; - if (flags & HAS_AVX2) vp9_variance16x16 = vp9_variance16x16_avx2; - vp9_variance16x32 = vp9_variance16x32_c; - if (flags & HAS_SSE2) vp9_variance16x32 = vp9_variance16x32_sse2; - vp9_variance16x8 = vp9_variance16x8_c; - if (flags & HAS_SSE2) vp9_variance16x8 = vp9_variance16x8_sse2; - vp9_variance32x16 = vp9_variance32x16_c; - if (flags & HAS_SSE2) vp9_variance32x16 = vp9_variance32x16_sse2; - if (flags & HAS_AVX2) vp9_variance32x16 = vp9_variance32x16_avx2; - vp9_variance32x32 = 
vp9_variance32x32_c; - if (flags & HAS_SSE2) vp9_variance32x32 = vp9_variance32x32_sse2; - if (flags & HAS_AVX2) vp9_variance32x32 = vp9_variance32x32_avx2; - vp9_variance32x64 = vp9_variance32x64_c; - if (flags & HAS_SSE2) vp9_variance32x64 = vp9_variance32x64_sse2; - vp9_variance4x4 = vp9_variance4x4_c; - if (flags & HAS_SSE2) vp9_variance4x4 = vp9_variance4x4_sse2; - vp9_variance4x8 = vp9_variance4x8_c; - if (flags & HAS_SSE2) vp9_variance4x8 = vp9_variance4x8_sse2; - vp9_variance64x32 = vp9_variance64x32_c; - if (flags & HAS_SSE2) vp9_variance64x32 = vp9_variance64x32_sse2; - if (flags & HAS_AVX2) vp9_variance64x32 = vp9_variance64x32_avx2; - vp9_variance64x64 = vp9_variance64x64_c; - if (flags & HAS_SSE2) vp9_variance64x64 = vp9_variance64x64_sse2; - if (flags & HAS_AVX2) vp9_variance64x64 = vp9_variance64x64_avx2; - vp9_variance8x16 = vp9_variance8x16_c; - if (flags & HAS_SSE2) vp9_variance8x16 = vp9_variance8x16_sse2; - vp9_variance8x4 = vp9_variance8x4_c; - if (flags & HAS_SSE2) vp9_variance8x4 = vp9_variance8x4_sse2; - vp9_variance8x8 = vp9_variance8x8_c; - if (flags & HAS_SSE2) vp9_variance8x8 = vp9_variance8x8_sse2; vp9_vector_var = vp9_vector_var_c; if (flags & HAS_SSE2) vp9_vector_var = vp9_vector_var_sse2; } diff --git a/chromium/third_party/libvpx/source/config/mac/ia32/vpx_config.asm b/chromium/third_party/libvpx/source/config/mac/ia32/vpx_config.asm index b9418dda315..9550f440153 100644 --- a/chromium/third_party/libvpx/source/config/mac/ia32/vpx_config.asm +++ b/chromium/third_party/libvpx/source/config/mac/ia32/vpx_config.asm @@ -20,7 +20,6 @@ %define HAVE_AVX2 1 %define HAVE_VPX_PORTS 1 %define HAVE_STDINT_H 1 -%define HAVE_ALT_TREE_LAYOUT 0 %define HAVE_PTHREAD_H 1 %define HAVE_SYS_MMAN_H 1 %define HAVE_UNISTD_H 0 diff --git a/chromium/third_party/libvpx/source/config/mac/ia32/vpx_config.c b/chromium/third_party/libvpx/source/config/mac/ia32/vpx_config.c index d8cb73c99ab..90849ad6fbf 100644 --- a/chromium/third_party/libvpx/source/config/mac/ia32/vpx_config.c +++ b/chromium/third_party/libvpx/source/config/mac/ia32/vpx_config.c @@ -5,5 +5,6 @@ /* tree. An additional intellectual property rights grant can be found */ /* in the file PATENTS. All contributing project authors may */ /* be found in the AUTHORS file in the root of the source tree. 
*/ +#include "vpx/vpx_codec.h" static const char* const cfg = "--target=x86-darwin9-gcc --enable-pic --enable-realtime-only --enable-external-build --enable-postproc --disable-install-srcs --enable-multi-res-encoding --enable-temporal-denoising --disable-unit-tests --disable-install-docs --disable-examples --enable-vp9-temporal-denoising --enable-vp9-postproc --size-limit=16384x16384"; const char *vpx_codec_build_config(void) {return cfg;} diff --git a/chromium/third_party/libvpx/source/config/mac/ia32/vpx_config.h b/chromium/third_party/libvpx/source/config/mac/ia32/vpx_config.h index 3715b4aef3e..c20daf6517f 100644 --- a/chromium/third_party/libvpx/source/config/mac/ia32/vpx_config.h +++ b/chromium/third_party/libvpx/source/config/mac/ia32/vpx_config.h @@ -32,7 +32,6 @@ #define HAVE_AVX2 1 #define HAVE_VPX_PORTS 1 #define HAVE_STDINT_H 1 -#define HAVE_ALT_TREE_LAYOUT 0 #define HAVE_PTHREAD_H 1 #define HAVE_SYS_MMAN_H 1 #define HAVE_UNISTD_H 0 diff --git a/chromium/third_party/libvpx/source/config/mac/ia32/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/mac/ia32/vpx_dsp_rtcd.h index 8cc2fa5b737..32ee77e25ce 100644 --- a/chromium/third_party/libvpx/source/config/mac/ia32/vpx_dsp_rtcd.h +++ b/chromium/third_party/libvpx/source/config/mac/ia32/vpx_dsp_rtcd.h @@ -18,6 +18,45 @@ extern "C" { #endif +void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +#define vpx_comp_avg_pred vpx_comp_avg_pred_c + +void vpx_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get16x16var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get16x16var_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +RTCD_EXTERN void (*vpx_get16x16var)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); + +unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); +#define vpx_get4x4sse_cs vpx_get4x4sse_cs_c + +void vpx_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get8x8var_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get8x8var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +RTCD_EXTERN void (*vpx_get8x8var)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); + +unsigned int vpx_get_mb_ss_c(const int16_t *); +unsigned int vpx_get_mb_ss_mmx(const int16_t *); +unsigned int vpx_get_mb_ss_sse2(const int16_t *); +RTCD_EXTERN unsigned int (*vpx_get_mb_ss)(const int16_t *); + +unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse16x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned 
int *sse); +RTCD_EXTERN unsigned int (*vpx_mse16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); + +unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse16x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); + +unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse8x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); + +unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse8x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); + unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad16x16_mmx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -251,6 +290,68 @@ void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_p void vpx_sad8x8x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); RTCD_EXTERN void (*vpx_sad8x8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int 
vpx_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x4_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance4x4)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance4x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x32_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance64x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int 
vpx_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance64x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x4)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + void vpx_dsp_rtcd(void); #ifdef RTCD_C @@ -261,6 +362,25 @@ static void setup_rtcd_internal(void) (void)flags; + vpx_get16x16var = vpx_get16x16var_c; + if (flags & HAS_SSE2) vpx_get16x16var = vpx_get16x16var_sse2; + if (flags & HAS_AVX2) vpx_get16x16var = vpx_get16x16var_avx2; + vpx_get8x8var = vpx_get8x8var_c; + if (flags & HAS_MMX) vpx_get8x8var = vpx_get8x8var_mmx; + if (flags & HAS_SSE2) vpx_get8x8var = vpx_get8x8var_sse2; + vpx_get_mb_ss = vpx_get_mb_ss_c; + if (flags & HAS_MMX) vpx_get_mb_ss = vpx_get_mb_ss_mmx; + if (flags & HAS_SSE2) vpx_get_mb_ss = vpx_get_mb_ss_sse2; + vpx_mse16x16 = vpx_mse16x16_c; + if (flags & HAS_MMX) vpx_mse16x16 = vpx_mse16x16_mmx; + if (flags & HAS_SSE2) vpx_mse16x16 = vpx_mse16x16_sse2; + if (flags & HAS_AVX2) vpx_mse16x16 = vpx_mse16x16_avx2; + vpx_mse16x8 = vpx_mse16x8_c; + if (flags & HAS_SSE2) vpx_mse16x8 = vpx_mse16x8_sse2; + vpx_mse8x16 = vpx_mse8x16_c; + if (flags & HAS_SSE2) vpx_mse8x16 = vpx_mse8x16_sse2; + vpx_mse8x8 = vpx_mse8x8_c; + if (flags & HAS_SSE2) vpx_mse8x8 = vpx_mse8x8_sse2; vpx_sad16x16 = vpx_sad16x16_c; if (flags & HAS_MMX) vpx_sad16x16 = vpx_sad16x16_mmx; if (flags & HAS_SSE2) vpx_sad16x16 = vpx_sad16x16_sse2; @@ -378,6 +498,42 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSE2) vpx_sad8x8x4d = vpx_sad8x8x4d_sse2; vpx_sad8x8x8 = vpx_sad8x8x8_c; if (flags & HAS_SSE4_1) vpx_sad8x8x8 = vpx_sad8x8x8_sse4_1; + vpx_variance16x16 = vpx_variance16x16_c; + if (flags & HAS_MMX) vpx_variance16x16 = vpx_variance16x16_mmx; + if (flags & HAS_SSE2) vpx_variance16x16 = vpx_variance16x16_sse2; + if (flags & 
HAS_AVX2) vpx_variance16x16 = vpx_variance16x16_avx2; + vpx_variance16x32 = vpx_variance16x32_c; + if (flags & HAS_SSE2) vpx_variance16x32 = vpx_variance16x32_sse2; + vpx_variance16x8 = vpx_variance16x8_c; + if (flags & HAS_MMX) vpx_variance16x8 = vpx_variance16x8_mmx; + if (flags & HAS_SSE2) vpx_variance16x8 = vpx_variance16x8_sse2; + vpx_variance32x16 = vpx_variance32x16_c; + if (flags & HAS_SSE2) vpx_variance32x16 = vpx_variance32x16_sse2; + if (flags & HAS_AVX2) vpx_variance32x16 = vpx_variance32x16_avx2; + vpx_variance32x32 = vpx_variance32x32_c; + if (flags & HAS_SSE2) vpx_variance32x32 = vpx_variance32x32_sse2; + if (flags & HAS_AVX2) vpx_variance32x32 = vpx_variance32x32_avx2; + vpx_variance32x64 = vpx_variance32x64_c; + if (flags & HAS_SSE2) vpx_variance32x64 = vpx_variance32x64_sse2; + vpx_variance4x4 = vpx_variance4x4_c; + if (flags & HAS_MMX) vpx_variance4x4 = vpx_variance4x4_mmx; + if (flags & HAS_SSE2) vpx_variance4x4 = vpx_variance4x4_sse2; + vpx_variance4x8 = vpx_variance4x8_c; + if (flags & HAS_SSE2) vpx_variance4x8 = vpx_variance4x8_sse2; + vpx_variance64x32 = vpx_variance64x32_c; + if (flags & HAS_SSE2) vpx_variance64x32 = vpx_variance64x32_sse2; + if (flags & HAS_AVX2) vpx_variance64x32 = vpx_variance64x32_avx2; + vpx_variance64x64 = vpx_variance64x64_c; + if (flags & HAS_SSE2) vpx_variance64x64 = vpx_variance64x64_sse2; + if (flags & HAS_AVX2) vpx_variance64x64 = vpx_variance64x64_avx2; + vpx_variance8x16 = vpx_variance8x16_c; + if (flags & HAS_MMX) vpx_variance8x16 = vpx_variance8x16_mmx; + if (flags & HAS_SSE2) vpx_variance8x16 = vpx_variance8x16_sse2; + vpx_variance8x4 = vpx_variance8x4_c; + if (flags & HAS_SSE2) vpx_variance8x4 = vpx_variance8x4_sse2; + vpx_variance8x8 = vpx_variance8x8_c; + if (flags & HAS_MMX) vpx_variance8x8 = vpx_variance8x8_mmx; + if (flags & HAS_SSE2) vpx_variance8x8 = vpx_variance8x8_sse2; } #endif diff --git a/chromium/third_party/libvpx/source/config/mac/x64/vp8_rtcd.h b/chromium/third_party/libvpx/source/config/mac/x64/vp8_rtcd.h index 9989eb09995..5285ac71055 100644 --- a/chromium/third_party/libvpx/source/config/mac/x64/vp8_rtcd.h +++ b/chromium/third_party/libvpx/source/config/mac/x64/vp8_rtcd.h @@ -74,10 +74,10 @@ void vp8_clear_system_state_c(); void vpx_reset_mmx_state(); #define vp8_clear_system_state vpx_reset_mmx_state -void vp8_copy32xn_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n); -void vp8_copy32xn_sse2(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n); -void vp8_copy32xn_sse3(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n); -RTCD_EXTERN void (*vp8_copy32xn)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n); +void vp8_copy32xn_c(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n); +void vp8_copy32xn_sse2(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n); +void vp8_copy32xn_sse3(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n); +RTCD_EXTERN void (*vp8_copy32xn)(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n); void vp8_copy_mem16x16_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); void vp8_copy_mem16x16_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); @@ -147,15 +147,6 @@ int 
vp8_full_search_sadx3(struct macroblock *x, struct block *b, struct blockd * int vp8_full_search_sadx8(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); RTCD_EXTERN int (*vp8_full_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -unsigned int vp8_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); -unsigned int vp8_get4x4sse_cs_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); -#define vp8_get4x4sse_cs vp8_get4x4sse_cs_mmx - -unsigned int vp8_get_mb_ss_c(const short *); -unsigned int vp8_get_mb_ss_mmx(const short *); -unsigned int vp8_get_mb_ss_sse2(const short *); -#define vp8_get_mb_ss vp8_get_mb_ss_sse2 - void vp8_intra4x4_predict_c(unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left); #define vp8_intra4x4_predict vp8_intra4x4_predict_c @@ -218,11 +209,6 @@ int vp8_mbuverror_mmx(struct macroblock *mb); int vp8_mbuverror_xmm(struct macroblock *mb); #define vp8_mbuverror vp8_mbuverror_xmm -unsigned int vp8_mse16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_mse16x16_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_mse16x16_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_mse16x16 vp8_mse16x16_wmt - void vp8_plane_add_noise_c(unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch); void vp8_plane_add_noise_mmx(unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch); void vp8_plane_add_noise_wmt(unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch); @@ -290,11 +276,6 @@ void vp8_sixtap_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, in void vp8_sixtap_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); RTCD_EXTERN void (*vp8_sixtap_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -unsigned int vp8_sub_pixel_mse16x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); -unsigned int vp8_sub_pixel_mse16x16_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); -unsigned int vp8_sub_pixel_mse16x16_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); -#define vp8_sub_pixel_mse16x16 vp8_sub_pixel_mse16x16_wmt - unsigned int vp8_sub_pixel_variance16x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); unsigned int vp8_sub_pixel_variance16x16_mmx(const unsigned 
char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); unsigned int vp8_sub_pixel_variance16x16_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); @@ -337,31 +318,6 @@ void vp8_subtract_mby_mmx(short *diff, unsigned char *src, int src_stride, unsig void vp8_subtract_mby_sse2(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride); #define vp8_subtract_mby vp8_subtract_mby_sse2 -unsigned int vp8_variance16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance16x16_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance16x16_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_variance16x16 vp8_variance16x16_wmt - -unsigned int vp8_variance16x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance16x8_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance16x8_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_variance16x8 vp8_variance16x8_wmt - -unsigned int vp8_variance4x4_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance4x4_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance4x4_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_variance4x4 vp8_variance4x4_wmt - -unsigned int vp8_variance8x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance8x16_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance8x16_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_variance8x16 vp8_variance8x16_wmt - -unsigned int vp8_variance8x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance8x8_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance8x8_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_variance8x8 vp8_variance8x8_wmt - unsigned int vp8_variance_halfpixvar16x16_h_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); unsigned int vp8_variance_halfpixvar16x16_h_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); unsigned int vp8_variance_halfpixvar16x16_h_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); diff --git 
a/chromium/third_party/libvpx/source/config/mac/x64/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/mac/x64/vp9_rtcd.h index 10a6b8401db..0834e762f34 100644 --- a/chromium/third_party/libvpx/source/config/mac/x64/vp9_rtcd.h +++ b/chromium/third_party/libvpx/source/config/mac/x64/vp9_rtcd.h @@ -116,7 +116,8 @@ void vp9_d153_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint RTCD_EXTERN void (*vp9_d153_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vp9_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d153_predictor_32x32 vp9_d153_predictor_32x32_c +void vp9_d153_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_d153_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vp9_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vp9_d153_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); @@ -321,19 +322,6 @@ void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); void vp9_fwht4x4_mmx(const int16_t *input, tran_low_t *output, int stride); #define vp9_fwht4x4 vp9_fwht4x4_mmx -void vp9_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -void vp9_get16x16var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -void vp9_get16x16var_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -RTCD_EXTERN void (*vp9_get16x16var)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); - -void vp9_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -void vp9_get8x8var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -#define vp9_get8x8var vp9_get8x8var_sse2 - -unsigned int vp9_get_mb_ss_c(const int16_t *); -unsigned int vp9_get_mb_ss_sse2(const int16_t *); -#define vp9_get_mb_ss vp9_get_mb_ss_sse2 - void vp9_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vp9_h_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); RTCD_EXTERN void (*vp9_h_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); @@ -488,23 +476,6 @@ void vp9_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *mi void vp9_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); #define vp9_minmax_8x8 vp9_minmax_8x8_sse2 -unsigned int vp9_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vp9_mse16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vp9_mse16x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_mse16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); - -unsigned int 
vp9_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vp9_mse16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -#define vp9_mse16x8 vp9_mse16x8_sse2 - -unsigned int vp9_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vp9_mse8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -#define vp9_mse8x16 vp9_mse8x16_sse2 - -unsigned int vp9_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vp9_mse8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -#define vp9_mse8x8 vp9_mse8x8_sse2 - void vp9_plane_add_noise_c(uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch); void vp9_plane_add_noise_wmt(uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch); #define vp9_plane_add_noise vp9_plane_add_noise_wmt @@ -709,63 +680,6 @@ void vp9_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov void vp9_v_predictor_8x8_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_v_predictor_8x8 vp9_v_predictor_8x8_sse -unsigned int vp9_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance16x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance16x32 vp9_variance16x32_sse2 - -unsigned int vp9_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance16x8 vp9_variance16x8_sse2 - -unsigned int vp9_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance32x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance32x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance32x32_sse2(const uint8_t 
*src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance32x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance32x64 vp9_variance32x64_sse2 - -unsigned int vp9_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance4x4 vp9_variance4x4_sse2 - -unsigned int vp9_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance4x8 vp9_variance4x8_sse2 - -unsigned int vp9_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance64x32_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance64x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance64x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance8x16 vp9_variance8x16_sse2 - -unsigned int vp9_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance8x4 vp9_variance8x4_sse2 - -unsigned int vp9_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance8x8 vp9_variance8x8_sse2 - int vp9_vector_var_c(int16_t const *ref, int16_t const *src, const int bwl); int 
vp9_vector_var_sse2(int16_t const *ref, int16_t const *src, const int bwl); #define vp9_vector_var vp9_vector_var_sse2 @@ -799,6 +713,8 @@ static void setup_rtcd_internal(void) if (flags & HAS_AVX2) vp9_convolve8_vert = vp9_convolve8_vert_avx2; vp9_d153_predictor_16x16 = vp9_d153_predictor_16x16_c; if (flags & HAS_SSSE3) vp9_d153_predictor_16x16 = vp9_d153_predictor_16x16_ssse3; + vp9_d153_predictor_32x32 = vp9_d153_predictor_32x32_c; + if (flags & HAS_SSSE3) vp9_d153_predictor_32x32 = vp9_d153_predictor_32x32_ssse3; vp9_d153_predictor_4x4 = vp9_d153_predictor_4x4_c; if (flags & HAS_SSSE3) vp9_d153_predictor_4x4 = vp9_d153_predictor_4x4_ssse3; vp9_d153_predictor_8x8 = vp9_d153_predictor_8x8_c; @@ -838,8 +754,6 @@ static void setup_rtcd_internal(void) vp9_full_search_sad = vp9_full_search_sad_c; if (flags & HAS_SSE3) vp9_full_search_sad = vp9_full_search_sadx3; if (flags & HAS_SSE4_1) vp9_full_search_sad = vp9_full_search_sadx8; - vp9_get16x16var = vp9_get16x16var_sse2; - if (flags & HAS_AVX2) vp9_get16x16var = vp9_get16x16var_avx2; vp9_h_predictor_16x16 = vp9_h_predictor_16x16_c; if (flags & HAS_SSSE3) vp9_h_predictor_16x16 = vp9_h_predictor_16x16_ssse3; vp9_h_predictor_32x32 = vp9_h_predictor_32x32_c; @@ -856,8 +770,6 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSSE3) vp9_idct8x8_64_add = vp9_idct8x8_64_add_ssse3; vp9_lpf_horizontal_16 = vp9_lpf_horizontal_16_sse2; if (flags & HAS_AVX2) vp9_lpf_horizontal_16 = vp9_lpf_horizontal_16_avx2; - vp9_mse16x16 = vp9_mse16x16_sse2; - if (flags & HAS_AVX2) vp9_mse16x16 = vp9_mse16x16_avx2; vp9_quantize_b = vp9_quantize_b_sse2; if (flags & HAS_SSSE3) vp9_quantize_b = vp9_quantize_b_ssse3; vp9_quantize_b_32x32 = vp9_quantize_b_32x32_c; @@ -922,16 +834,6 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSSE3) vp9_sub_pixel_variance8x4 = vp9_sub_pixel_variance8x4_ssse3; vp9_sub_pixel_variance8x8 = vp9_sub_pixel_variance8x8_sse2; if (flags & HAS_SSSE3) vp9_sub_pixel_variance8x8 = vp9_sub_pixel_variance8x8_ssse3; - vp9_variance16x16 = vp9_variance16x16_sse2; - if (flags & HAS_AVX2) vp9_variance16x16 = vp9_variance16x16_avx2; - vp9_variance32x16 = vp9_variance32x16_sse2; - if (flags & HAS_AVX2) vp9_variance32x16 = vp9_variance32x16_avx2; - vp9_variance32x32 = vp9_variance32x32_sse2; - if (flags & HAS_AVX2) vp9_variance32x32 = vp9_variance32x32_avx2; - vp9_variance64x32 = vp9_variance64x32_sse2; - if (flags & HAS_AVX2) vp9_variance64x32 = vp9_variance64x32_avx2; - vp9_variance64x64 = vp9_variance64x64_sse2; - if (flags & HAS_AVX2) vp9_variance64x64 = vp9_variance64x64_avx2; } #endif diff --git a/chromium/third_party/libvpx/source/config/mac/x64/vpx_config.asm b/chromium/third_party/libvpx/source/config/mac/x64/vpx_config.asm index 68a0d73808d..0c4b550b78a 100644 --- a/chromium/third_party/libvpx/source/config/mac/x64/vpx_config.asm +++ b/chromium/third_party/libvpx/source/config/mac/x64/vpx_config.asm @@ -20,7 +20,6 @@ %define HAVE_AVX2 1 %define HAVE_VPX_PORTS 1 %define HAVE_STDINT_H 1 -%define HAVE_ALT_TREE_LAYOUT 0 %define HAVE_PTHREAD_H 1 %define HAVE_SYS_MMAN_H 1 %define HAVE_UNISTD_H 0 diff --git a/chromium/third_party/libvpx/source/config/mac/x64/vpx_config.c b/chromium/third_party/libvpx/source/config/mac/x64/vpx_config.c index 62f0db7d0c7..cba8a3d5809 100644 --- a/chromium/third_party/libvpx/source/config/mac/x64/vpx_config.c +++ b/chromium/third_party/libvpx/source/config/mac/x64/vpx_config.c @@ -5,5 +5,6 @@ /* tree. An additional intellectual property rights grant can be found */ /* in the file PATENTS. 
All contributing project authors may */ /* be found in the AUTHORS file in the root of the source tree. */ +#include "vpx/vpx_codec.h" static const char* const cfg = "--target=x86_64-darwin9-gcc --enable-pic --enable-realtime-only --enable-external-build --enable-postproc --disable-install-srcs --enable-multi-res-encoding --enable-temporal-denoising --disable-unit-tests --disable-install-docs --disable-examples --enable-vp9-temporal-denoising --enable-vp9-postproc --size-limit=16384x16384"; const char *vpx_codec_build_config(void) {return cfg;} diff --git a/chromium/third_party/libvpx/source/config/mac/x64/vpx_config.h b/chromium/third_party/libvpx/source/config/mac/x64/vpx_config.h index 72f336395ce..27d91efdbd4 100644 --- a/chromium/third_party/libvpx/source/config/mac/x64/vpx_config.h +++ b/chromium/third_party/libvpx/source/config/mac/x64/vpx_config.h @@ -32,7 +32,6 @@ #define HAVE_AVX2 1 #define HAVE_VPX_PORTS 1 #define HAVE_STDINT_H 1 -#define HAVE_ALT_TREE_LAYOUT 0 #define HAVE_PTHREAD_H 1 #define HAVE_SYS_MMAN_H 1 #define HAVE_UNISTD_H 0 diff --git a/chromium/third_party/libvpx/source/config/mac/x64/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/mac/x64/vpx_dsp_rtcd.h index b5df5e08c57..d93c56eb765 100644 --- a/chromium/third_party/libvpx/source/config/mac/x64/vpx_dsp_rtcd.h +++ b/chromium/third_party/libvpx/source/config/mac/x64/vpx_dsp_rtcd.h @@ -18,6 +18,45 @@ extern "C" { #endif +void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +#define vpx_comp_avg_pred vpx_comp_avg_pred_c + +void vpx_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get16x16var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get16x16var_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +RTCD_EXTERN void (*vpx_get16x16var)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); + +unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); +#define vpx_get4x4sse_cs vpx_get4x4sse_cs_c + +void vpx_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get8x8var_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get8x8var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_get8x8var vpx_get8x8var_sse2 + +unsigned int vpx_get_mb_ss_c(const int16_t *); +unsigned int vpx_get_mb_ss_mmx(const int16_t *); +unsigned int vpx_get_mb_ss_sse2(const int16_t *); +#define vpx_get_mb_ss vpx_get_mb_ss_sse2 + +unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse16x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +RTCD_EXTERN 
unsigned int (*vpx_mse16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); + +unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_mse16x8 vpx_mse16x8_sse2 + +unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_mse8x16 vpx_mse8x16_sse2 + +unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_mse8x8 vpx_mse8x8_sse2 + unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad16x16_mmx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -251,6 +290,68 @@ void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_p void vpx_sad8x8x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); RTCD_EXTERN void (*vpx_sad8x8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance16x32 vpx_variance16x32_sse2 + +unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance16x8 vpx_variance16x8_sse2 + +unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int 
vpx_variance32x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance32x64 vpx_variance32x64_sse2 + +unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x4_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance4x4 vpx_variance4x4_sse2 + +unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance4x8 vpx_variance4x8_sse2 + +unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x32_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance64x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance64x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance8x16 vpx_variance8x16_sse2 
+ +unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance8x4 vpx_variance8x4_sse2 + +unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance8x8 vpx_variance8x8_sse2 + void vpx_dsp_rtcd(void); #ifdef RTCD_C @@ -261,6 +362,10 @@ static void setup_rtcd_internal(void) (void)flags; + vpx_get16x16var = vpx_get16x16var_sse2; + if (flags & HAS_AVX2) vpx_get16x16var = vpx_get16x16var_avx2; + vpx_mse16x16 = vpx_mse16x16_sse2; + if (flags & HAS_AVX2) vpx_mse16x16 = vpx_mse16x16_avx2; vpx_sad16x16x3 = vpx_sad16x16x3_c; if (flags & HAS_SSE3) vpx_sad16x16x3 = vpx_sad16x16x3_sse3; if (flags & HAS_SSSE3) vpx_sad16x16x3 = vpx_sad16x16x3_ssse3; @@ -307,6 +412,16 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSE3) vpx_sad8x8x3 = vpx_sad8x8x3_sse3; vpx_sad8x8x8 = vpx_sad8x8x8_c; if (flags & HAS_SSE4_1) vpx_sad8x8x8 = vpx_sad8x8x8_sse4_1; + vpx_variance16x16 = vpx_variance16x16_sse2; + if (flags & HAS_AVX2) vpx_variance16x16 = vpx_variance16x16_avx2; + vpx_variance32x16 = vpx_variance32x16_sse2; + if (flags & HAS_AVX2) vpx_variance32x16 = vpx_variance32x16_avx2; + vpx_variance32x32 = vpx_variance32x32_sse2; + if (flags & HAS_AVX2) vpx_variance32x32 = vpx_variance32x32_avx2; + vpx_variance64x32 = vpx_variance64x32_sse2; + if (flags & HAS_AVX2) vpx_variance64x32 = vpx_variance64x32_avx2; + vpx_variance64x64 = vpx_variance64x64_sse2; + if (flags & HAS_AVX2) vpx_variance64x64 = vpx_variance64x64_avx2; } #endif diff --git a/chromium/third_party/libvpx/source/config/nacl/vp8_rtcd.h b/chromium/third_party/libvpx/source/config/nacl/vp8_rtcd.h index 1bd7496fe55..7a33ed2effb 100644 --- a/chromium/third_party/libvpx/source/config/nacl/vp8_rtcd.h +++ b/chromium/third_party/libvpx/source/config/nacl/vp8_rtcd.h @@ -107,12 +107,6 @@ void vp8_filter_by_weight8x8_c(unsigned char *src, int src_stride, unsigned char int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); #define vp8_full_search_sad vp8_full_search_sad_c -unsigned int vp8_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); -#define vp8_get4x4sse_cs vp8_get4x4sse_cs_c - -unsigned int vp8_get_mb_ss_c(const short *); -#define vp8_get_mb_ss vp8_get_mb_ss_c - void vp8_intra4x4_predict_c(unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left); #define vp8_intra4x4_predict vp8_intra4x4_predict_c @@ -152,9 +146,6 @@ void vp8_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,in int vp8_mbuverror_c(struct macroblock *mb); #define vp8_mbuverror vp8_mbuverror_c -unsigned int vp8_mse16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_mse16x16 vp8_mse16x16_c - void 
vp8_plane_add_noise_c(unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch); #define vp8_plane_add_noise vp8_plane_add_noise_c @@ -197,9 +188,6 @@ void vp8_sixtap_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int y void vp8_sixtap_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); #define vp8_sixtap_predict8x8 vp8_sixtap_predict8x8_c -unsigned int vp8_sub_pixel_mse16x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); -#define vp8_sub_pixel_mse16x16 vp8_sub_pixel_mse16x16_c - unsigned int vp8_sub_pixel_variance16x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); #define vp8_sub_pixel_variance16x16 vp8_sub_pixel_variance16x16_c @@ -224,21 +212,6 @@ void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, void vp8_subtract_mby_c(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride); #define vp8_subtract_mby vp8_subtract_mby_c -unsigned int vp8_variance16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_variance16x16 vp8_variance16x16_c - -unsigned int vp8_variance16x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_variance16x8 vp8_variance16x8_c - -unsigned int vp8_variance4x4_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_variance4x4 vp8_variance4x4_c - -unsigned int vp8_variance8x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_variance8x16 vp8_variance8x16_c - -unsigned int vp8_variance8x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_variance8x8 vp8_variance8x8_c - unsigned int vp8_variance_halfpixvar16x16_h_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); #define vp8_variance_halfpixvar16x16_h vp8_variance_halfpixvar16x16_h_c diff --git a/chromium/third_party/libvpx/source/config/nacl/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/nacl/vp9_rtcd.h index 84bd45f2876..8cf86073577 100644 --- a/chromium/third_party/libvpx/source/config/nacl/vp9_rtcd.h +++ b/chromium/third_party/libvpx/source/config/nacl/vp9_rtcd.h @@ -245,15 +245,6 @@ int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, i void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); #define vp9_fwht4x4 vp9_fwht4x4_c -void vp9_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -#define vp9_get16x16var vp9_get16x16var_c - -void vp9_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -#define vp9_get8x8var vp9_get8x8var_c - -unsigned int vp9_get_mb_ss_c(const int16_t *); -#define vp9_get_mb_ss vp9_get_mb_ss_c - void vp9_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_h_predictor_16x16 vp9_h_predictor_16x16_c @@ 
-368,18 +359,6 @@ void vp9_mbpost_proc_down_c(uint8_t *dst, int pitch, int rows, int cols, int fli void vp9_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); #define vp9_minmax_8x8 vp9_minmax_8x8_c -unsigned int vp9_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -#define vp9_mse16x16 vp9_mse16x16_c - -unsigned int vp9_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -#define vp9_mse16x8 vp9_mse16x8_c - -unsigned int vp9_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -#define vp9_mse8x16 vp9_mse8x16_c - -unsigned int vp9_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -#define vp9_mse8x8 vp9_mse8x8_c - void vp9_plane_add_noise_c(uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch); #define vp9_plane_add_noise vp9_plane_add_noise_c @@ -509,45 +488,6 @@ void vp9_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov void vp9_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_v_predictor_8x8 vp9_v_predictor_8x8_c -unsigned int vp9_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance16x16 vp9_variance16x16_c - -unsigned int vp9_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance16x32 vp9_variance16x32_c - -unsigned int vp9_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance16x8 vp9_variance16x8_c - -unsigned int vp9_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance32x16 vp9_variance32x16_c - -unsigned int vp9_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance32x32 vp9_variance32x32_c - -unsigned int vp9_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance32x64 vp9_variance32x64_c - -unsigned int vp9_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance4x4 vp9_variance4x4_c - -unsigned int vp9_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance4x8 vp9_variance4x8_c - -unsigned int vp9_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance64x32 vp9_variance64x32_c - -unsigned int vp9_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance64x64 vp9_variance64x64_c - -unsigned int vp9_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance8x16 vp9_variance8x16_c - -unsigned int vp9_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); 
-#define vp9_variance8x4 vp9_variance8x4_c - -unsigned int vp9_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance8x8 vp9_variance8x8_c - int vp9_vector_var_c(int16_t const *ref, int16_t const *src, const int bwl); #define vp9_vector_var vp9_vector_var_c diff --git a/chromium/third_party/libvpx/source/config/nacl/vpx_config.asm b/chromium/third_party/libvpx/source/config/nacl/vpx_config.asm index 20db06465bf..a2a18fdc7fd 100644 --- a/chromium/third_party/libvpx/source/config/nacl/vpx_config.asm +++ b/chromium/third_party/libvpx/source/config/nacl/vpx_config.asm @@ -23,7 +23,6 @@ .equ HAVE_AVX2 , 0 .equ HAVE_VPX_PORTS , 1 .equ HAVE_STDINT_H , 1 -.equ HAVE_ALT_TREE_LAYOUT , 0 .equ HAVE_PTHREAD_H , 1 .equ HAVE_SYS_MMAN_H , 1 .equ HAVE_UNISTD_H , 0 diff --git a/chromium/third_party/libvpx/source/config/nacl/vpx_config.c b/chromium/third_party/libvpx/source/config/nacl/vpx_config.c index 0e8d7265f36..2a18a45e3e3 100644 --- a/chromium/third_party/libvpx/source/config/nacl/vpx_config.c +++ b/chromium/third_party/libvpx/source/config/nacl/vpx_config.c @@ -5,5 +5,6 @@ /* tree. An additional intellectual property rights grant can be found */ /* in the file PATENTS. All contributing project authors may */ /* be found in the AUTHORS file in the root of the source tree. */ +#include "vpx/vpx_codec.h" static const char* const cfg = "--target=generic-gnu --enable-pic --enable-realtime-only --enable-external-build --enable-postproc --disable-install-srcs --enable-multi-res-encoding --enable-temporal-denoising --disable-unit-tests --disable-install-docs --disable-examples --enable-vp9-temporal-denoising --enable-vp9-postproc --size-limit=16384x16384"; const char *vpx_codec_build_config(void) {return cfg;} diff --git a/chromium/third_party/libvpx/source/config/nacl/vpx_config.h b/chromium/third_party/libvpx/source/config/nacl/vpx_config.h index ec8222e18c2..290508c1c88 100644 --- a/chromium/third_party/libvpx/source/config/nacl/vpx_config.h +++ b/chromium/third_party/libvpx/source/config/nacl/vpx_config.h @@ -32,7 +32,6 @@ #define HAVE_AVX2 0 #define HAVE_VPX_PORTS 1 #define HAVE_STDINT_H 1 -#define HAVE_ALT_TREE_LAYOUT 0 #define HAVE_PTHREAD_H 1 #define HAVE_SYS_MMAN_H 1 #define HAVE_UNISTD_H 0 diff --git a/chromium/third_party/libvpx/source/config/nacl/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/nacl/vpx_dsp_rtcd.h index 86c5b4425dc..f086946da23 100644 --- a/chromium/third_party/libvpx/source/config/nacl/vpx_dsp_rtcd.h +++ b/chromium/third_party/libvpx/source/config/nacl/vpx_dsp_rtcd.h @@ -18,6 +18,33 @@ extern "C" { #endif +void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +#define vpx_comp_avg_pred vpx_comp_avg_pred_c + +void vpx_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_get16x16var vpx_get16x16var_c + +unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); +#define vpx_get4x4sse_cs vpx_get4x4sse_cs_c + +void vpx_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_get8x8var vpx_get8x8var_c + +unsigned int vpx_get_mb_ss_c(const int16_t *); +#define vpx_get_mb_ss vpx_get_mb_ss_c + +unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int 
recon_stride, unsigned int *sse); +#define vpx_mse16x16 vpx_mse16x16_c + +unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_mse16x8 vpx_mse16x8_c + +unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_mse8x16 vpx_mse8x16_c + +unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_mse8x8 vpx_mse8x8_c + unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); #define vpx_sad16x16 vpx_sad16x16_c @@ -183,6 +210,45 @@ void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * con void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); #define vpx_sad8x8x8 vpx_sad8x8x8_c +unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance16x16 vpx_variance16x16_c + +unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance16x32 vpx_variance16x32_c + +unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance16x8 vpx_variance16x8_c + +unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance32x16 vpx_variance32x16_c + +unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance32x32 vpx_variance32x32_c + +unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance32x64 vpx_variance32x64_c + +unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance4x4 vpx_variance4x4_c + +unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance4x8 vpx_variance4x8_c + +unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance64x32 vpx_variance64x32_c + +unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance64x64 vpx_variance64x64_c + +unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance8x16 vpx_variance8x16_c + +unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance8x4 vpx_variance8x4_c + +unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance8x8 vpx_variance8x8_c + void vpx_dsp_rtcd(void); #include "vpx_config.h" diff --git a/chromium/third_party/libvpx/source/config/win/ia32/vp8_rtcd.h 
b/chromium/third_party/libvpx/source/config/win/ia32/vp8_rtcd.h index b60e7ac7301..38fe6d360b0 100644 --- a/chromium/third_party/libvpx/source/config/win/ia32/vp8_rtcd.h +++ b/chromium/third_party/libvpx/source/config/win/ia32/vp8_rtcd.h @@ -74,10 +74,10 @@ void vp8_clear_system_state_c(); void vpx_reset_mmx_state(); RTCD_EXTERN void (*vp8_clear_system_state)(); -void vp8_copy32xn_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n); -void vp8_copy32xn_sse2(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n); -void vp8_copy32xn_sse3(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n); -RTCD_EXTERN void (*vp8_copy32xn)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n); +void vp8_copy32xn_c(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n); +void vp8_copy32xn_sse2(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n); +void vp8_copy32xn_sse3(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n); +RTCD_EXTERN void (*vp8_copy32xn)(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n); void vp8_copy_mem16x16_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); void vp8_copy_mem16x16_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); @@ -147,15 +147,6 @@ int vp8_full_search_sadx3(struct macroblock *x, struct block *b, struct blockd * int vp8_full_search_sadx8(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); RTCD_EXTERN int (*vp8_full_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -unsigned int vp8_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); -unsigned int vp8_get4x4sse_cs_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); -RTCD_EXTERN unsigned int (*vp8_get4x4sse_cs)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); - -unsigned int vp8_get_mb_ss_c(const short *); -unsigned int vp8_get_mb_ss_mmx(const short *); -unsigned int vp8_get_mb_ss_sse2(const short *); -RTCD_EXTERN unsigned int (*vp8_get_mb_ss)(const short *); - void vp8_intra4x4_predict_c(unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left); #define vp8_intra4x4_predict vp8_intra4x4_predict_c @@ -218,11 +209,6 @@ int vp8_mbuverror_mmx(struct macroblock *mb); int vp8_mbuverror_xmm(struct macroblock *mb); RTCD_EXTERN int (*vp8_mbuverror)(struct macroblock *mb); -unsigned int vp8_mse16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_mse16x16_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_mse16x16_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int 
(*vp8_mse16x16)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); - void vp8_plane_add_noise_c(unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch); void vp8_plane_add_noise_mmx(unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch); void vp8_plane_add_noise_wmt(unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch); @@ -290,11 +276,6 @@ void vp8_sixtap_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, in void vp8_sixtap_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); RTCD_EXTERN void (*vp8_sixtap_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -unsigned int vp8_sub_pixel_mse16x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); -unsigned int vp8_sub_pixel_mse16x16_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); -unsigned int vp8_sub_pixel_mse16x16_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp8_sub_pixel_mse16x16)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); - unsigned int vp8_sub_pixel_variance16x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); unsigned int vp8_sub_pixel_variance16x16_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); unsigned int vp8_sub_pixel_variance16x16_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); @@ -337,31 +318,6 @@ void vp8_subtract_mby_mmx(short *diff, unsigned char *src, int src_stride, unsig void vp8_subtract_mby_sse2(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride); RTCD_EXTERN void (*vp8_subtract_mby)(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride); -unsigned int vp8_variance16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance16x16_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance16x16_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp8_variance16x16)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp8_variance16x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance16x8_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int 
vp8_variance16x8_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp8_variance16x8)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp8_variance4x4_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance4x4_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance4x4_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp8_variance4x4)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp8_variance8x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance8x16_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance8x16_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp8_variance8x16)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp8_variance8x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance8x8_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance8x8_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp8_variance8x8)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); - unsigned int vp8_variance_halfpixvar16x16_h_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); unsigned int vp8_variance_halfpixvar16x16_h_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); unsigned int vp8_variance_halfpixvar16x16_h_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); @@ -448,11 +404,6 @@ static void setup_rtcd_internal(void) vp8_full_search_sad = vp8_full_search_sad_c; if (flags & HAS_SSE3) vp8_full_search_sad = vp8_full_search_sadx3; if (flags & HAS_SSE4_1) vp8_full_search_sad = vp8_full_search_sadx8; - vp8_get4x4sse_cs = vp8_get4x4sse_cs_c; - if (flags & HAS_MMX) vp8_get4x4sse_cs = vp8_get4x4sse_cs_mmx; - vp8_get_mb_ss = vp8_get_mb_ss_c; - if (flags & HAS_MMX) vp8_get_mb_ss = vp8_get_mb_ss_mmx; - if (flags & HAS_SSE2) vp8_get_mb_ss = vp8_get_mb_ss_sse2; vp8_loop_filter_bh = vp8_loop_filter_bh_c; if (flags & HAS_MMX) vp8_loop_filter_bh = vp8_loop_filter_bh_mmx; if (flags & HAS_SSE2) vp8_loop_filter_bh = vp8_loop_filter_bh_sse2; @@ -488,9 +439,6 @@ static void setup_rtcd_internal(void) vp8_mbuverror = vp8_mbuverror_c; if (flags & HAS_MMX) vp8_mbuverror = vp8_mbuverror_mmx; if (flags & HAS_SSE2) vp8_mbuverror = vp8_mbuverror_xmm; - vp8_mse16x16 = vp8_mse16x16_c; - if (flags & HAS_MMX) 
vp8_mse16x16 = vp8_mse16x16_mmx; - if (flags & HAS_SSE2) vp8_mse16x16 = vp8_mse16x16_wmt; vp8_plane_add_noise = vp8_plane_add_noise_c; if (flags & HAS_MMX) vp8_plane_add_noise = vp8_plane_add_noise_mmx; if (flags & HAS_SSE2) vp8_plane_add_noise = vp8_plane_add_noise_wmt; @@ -529,9 +477,6 @@ static void setup_rtcd_internal(void) if (flags & HAS_MMX) vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_mmx; if (flags & HAS_SSE2) vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_sse2; if (flags & HAS_SSSE3) vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_ssse3; - vp8_sub_pixel_mse16x16 = vp8_sub_pixel_mse16x16_c; - if (flags & HAS_MMX) vp8_sub_pixel_mse16x16 = vp8_sub_pixel_mse16x16_mmx; - if (flags & HAS_SSE2) vp8_sub_pixel_mse16x16 = vp8_sub_pixel_mse16x16_wmt; vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_c; if (flags & HAS_MMX) vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_mmx; if (flags & HAS_SSE2) vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_wmt; @@ -558,21 +503,6 @@ static void setup_rtcd_internal(void) vp8_subtract_mby = vp8_subtract_mby_c; if (flags & HAS_MMX) vp8_subtract_mby = vp8_subtract_mby_mmx; if (flags & HAS_SSE2) vp8_subtract_mby = vp8_subtract_mby_sse2; - vp8_variance16x16 = vp8_variance16x16_c; - if (flags & HAS_MMX) vp8_variance16x16 = vp8_variance16x16_mmx; - if (flags & HAS_SSE2) vp8_variance16x16 = vp8_variance16x16_wmt; - vp8_variance16x8 = vp8_variance16x8_c; - if (flags & HAS_MMX) vp8_variance16x8 = vp8_variance16x8_mmx; - if (flags & HAS_SSE2) vp8_variance16x8 = vp8_variance16x8_wmt; - vp8_variance4x4 = vp8_variance4x4_c; - if (flags & HAS_MMX) vp8_variance4x4 = vp8_variance4x4_mmx; - if (flags & HAS_SSE2) vp8_variance4x4 = vp8_variance4x4_wmt; - vp8_variance8x16 = vp8_variance8x16_c; - if (flags & HAS_MMX) vp8_variance8x16 = vp8_variance8x16_mmx; - if (flags & HAS_SSE2) vp8_variance8x16 = vp8_variance8x16_wmt; - vp8_variance8x8 = vp8_variance8x8_c; - if (flags & HAS_MMX) vp8_variance8x8 = vp8_variance8x8_mmx; - if (flags & HAS_SSE2) vp8_variance8x8 = vp8_variance8x8_wmt; vp8_variance_halfpixvar16x16_h = vp8_variance_halfpixvar16x16_h_c; if (flags & HAS_MMX) vp8_variance_halfpixvar16x16_h = vp8_variance_halfpixvar16x16_h_mmx; if (flags & HAS_SSE2) vp8_variance_halfpixvar16x16_h = vp8_variance_halfpixvar16x16_h_wmt; diff --git a/chromium/third_party/libvpx/source/config/win/ia32/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/win/ia32/vp9_rtcd.h index 7425dc281bd..1da20dc1bcc 100644 --- a/chromium/third_party/libvpx/source/config/win/ia32/vp9_rtcd.h +++ b/chromium/third_party/libvpx/source/config/win/ia32/vp9_rtcd.h @@ -116,7 +116,8 @@ void vp9_d153_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint RTCD_EXTERN void (*vp9_d153_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vp9_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d153_predictor_32x32 vp9_d153_predictor_32x32_c +void vp9_d153_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_d153_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vp9_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vp9_d153_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); @@ -320,19 +321,6 @@ void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, 
int stride); void vp9_fwht4x4_mmx(const int16_t *input, tran_low_t *output, int stride); RTCD_EXTERN void (*vp9_fwht4x4)(const int16_t *input, tran_low_t *output, int stride); -void vp9_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -void vp9_get16x16var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -void vp9_get16x16var_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -RTCD_EXTERN void (*vp9_get16x16var)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); - -void vp9_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -void vp9_get8x8var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -RTCD_EXTERN void (*vp9_get8x8var)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); - -unsigned int vp9_get_mb_ss_c(const int16_t *); -unsigned int vp9_get_mb_ss_sse2(const int16_t *); -RTCD_EXTERN unsigned int (*vp9_get_mb_ss)(const int16_t *); - void vp9_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vp9_h_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); RTCD_EXTERN void (*vp9_h_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); @@ -484,23 +472,6 @@ void vp9_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *mi void vp9_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); RTCD_EXTERN void (*vp9_minmax_8x8)(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); -unsigned int vp9_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vp9_mse16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vp9_mse16x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_mse16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); - -unsigned int vp9_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vp9_mse16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_mse16x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); - -unsigned int vp9_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vp9_mse8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_mse8x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); - -unsigned int vp9_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vp9_mse8x8_sse2(const uint8_t *src_ptr, int 
source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_mse8x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); - void vp9_plane_add_noise_c(uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch); void vp9_plane_add_noise_wmt(uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch); RTCD_EXTERN void (*vp9_plane_add_noise)(uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch); @@ -700,63 +671,6 @@ void vp9_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov void vp9_v_predictor_8x8_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); RTCD_EXTERN void (*vp9_v_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -unsigned int vp9_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance16x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance16x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance16x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance32x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance32x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance32x32)(const uint8_t 
*src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance32x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance4x4)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance4x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance64x32_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance64x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance64x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance8x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance8x4)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance8x8_sse2(const uint8_t *src_ptr, int 
source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance8x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - int vp9_vector_var_c(int16_t const *ref, int16_t const *src, const int bwl); int vp9_vector_var_sse2(int16_t const *ref, int16_t const *src, const int bwl); RTCD_EXTERN int (*vp9_vector_var)(int16_t const *ref, int16_t const *src, const int bwl); @@ -807,6 +721,8 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSE2) vp9_convolve_copy = vp9_convolve_copy_sse2; vp9_d153_predictor_16x16 = vp9_d153_predictor_16x16_c; if (flags & HAS_SSSE3) vp9_d153_predictor_16x16 = vp9_d153_predictor_16x16_ssse3; + vp9_d153_predictor_32x32 = vp9_d153_predictor_32x32_c; + if (flags & HAS_SSSE3) vp9_d153_predictor_32x32 = vp9_d153_predictor_32x32_ssse3; vp9_d153_predictor_4x4 = vp9_d153_predictor_4x4_c; if (flags & HAS_SSSE3) vp9_d153_predictor_4x4 = vp9_d153_predictor_4x4_ssse3; vp9_d153_predictor_8x8 = vp9_d153_predictor_8x8_c; @@ -907,13 +823,6 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSE4_1) vp9_full_search_sad = vp9_full_search_sadx8; vp9_fwht4x4 = vp9_fwht4x4_c; if (flags & HAS_MMX) vp9_fwht4x4 = vp9_fwht4x4_mmx; - vp9_get16x16var = vp9_get16x16var_c; - if (flags & HAS_SSE2) vp9_get16x16var = vp9_get16x16var_sse2; - if (flags & HAS_AVX2) vp9_get16x16var = vp9_get16x16var_avx2; - vp9_get8x8var = vp9_get8x8var_c; - if (flags & HAS_SSE2) vp9_get8x8var = vp9_get8x8var_sse2; - vp9_get_mb_ss = vp9_get_mb_ss_c; - if (flags & HAS_SSE2) vp9_get_mb_ss = vp9_get_mb_ss_sse2; vp9_h_predictor_16x16 = vp9_h_predictor_16x16_c; if (flags & HAS_SSSE3) vp9_h_predictor_16x16 = vp9_h_predictor_16x16_ssse3; vp9_h_predictor_32x32 = vp9_h_predictor_32x32_c; @@ -987,15 +896,6 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSE2) vp9_mbpost_proc_down = vp9_mbpost_proc_down_xmm; vp9_minmax_8x8 = vp9_minmax_8x8_c; if (flags & HAS_SSE2) vp9_minmax_8x8 = vp9_minmax_8x8_sse2; - vp9_mse16x16 = vp9_mse16x16_c; - if (flags & HAS_SSE2) vp9_mse16x16 = vp9_mse16x16_sse2; - if (flags & HAS_AVX2) vp9_mse16x16 = vp9_mse16x16_avx2; - vp9_mse16x8 = vp9_mse16x8_c; - if (flags & HAS_SSE2) vp9_mse16x8 = vp9_mse16x8_sse2; - vp9_mse8x16 = vp9_mse8x16_c; - if (flags & HAS_SSE2) vp9_mse8x16 = vp9_mse8x16_sse2; - vp9_mse8x8 = vp9_mse8x8_c; - if (flags & HAS_SSE2) vp9_mse8x8 = vp9_mse8x8_sse2; vp9_plane_add_noise = vp9_plane_add_noise_c; if (flags & HAS_SSE2) vp9_plane_add_noise = vp9_plane_add_noise_wmt; vp9_post_proc_down_and_across = vp9_post_proc_down_and_across_c; @@ -1106,37 +1006,6 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSE) vp9_v_predictor_4x4 = vp9_v_predictor_4x4_sse; vp9_v_predictor_8x8 = vp9_v_predictor_8x8_c; if (flags & HAS_SSE) vp9_v_predictor_8x8 = vp9_v_predictor_8x8_sse; - vp9_variance16x16 = vp9_variance16x16_c; - if (flags & HAS_SSE2) vp9_variance16x16 = vp9_variance16x16_sse2; - if (flags & HAS_AVX2) vp9_variance16x16 = vp9_variance16x16_avx2; - vp9_variance16x32 = vp9_variance16x32_c; - if (flags & HAS_SSE2) vp9_variance16x32 = vp9_variance16x32_sse2; - vp9_variance16x8 = vp9_variance16x8_c; - if (flags & HAS_SSE2) vp9_variance16x8 = vp9_variance16x8_sse2; - vp9_variance32x16 = vp9_variance32x16_c; - if (flags & HAS_SSE2) vp9_variance32x16 = vp9_variance32x16_sse2; - if (flags & HAS_AVX2) vp9_variance32x16 = vp9_variance32x16_avx2; - vp9_variance32x32 = vp9_variance32x32_c; - if (flags & HAS_SSE2) vp9_variance32x32 = vp9_variance32x32_sse2; - if (flags & 
HAS_AVX2) vp9_variance32x32 = vp9_variance32x32_avx2; - vp9_variance32x64 = vp9_variance32x64_c; - if (flags & HAS_SSE2) vp9_variance32x64 = vp9_variance32x64_sse2; - vp9_variance4x4 = vp9_variance4x4_c; - if (flags & HAS_SSE2) vp9_variance4x4 = vp9_variance4x4_sse2; - vp9_variance4x8 = vp9_variance4x8_c; - if (flags & HAS_SSE2) vp9_variance4x8 = vp9_variance4x8_sse2; - vp9_variance64x32 = vp9_variance64x32_c; - if (flags & HAS_SSE2) vp9_variance64x32 = vp9_variance64x32_sse2; - if (flags & HAS_AVX2) vp9_variance64x32 = vp9_variance64x32_avx2; - vp9_variance64x64 = vp9_variance64x64_c; - if (flags & HAS_SSE2) vp9_variance64x64 = vp9_variance64x64_sse2; - if (flags & HAS_AVX2) vp9_variance64x64 = vp9_variance64x64_avx2; - vp9_variance8x16 = vp9_variance8x16_c; - if (flags & HAS_SSE2) vp9_variance8x16 = vp9_variance8x16_sse2; - vp9_variance8x4 = vp9_variance8x4_c; - if (flags & HAS_SSE2) vp9_variance8x4 = vp9_variance8x4_sse2; - vp9_variance8x8 = vp9_variance8x8_c; - if (flags & HAS_SSE2) vp9_variance8x8 = vp9_variance8x8_sse2; vp9_vector_var = vp9_vector_var_c; if (flags & HAS_SSE2) vp9_vector_var = vp9_vector_var_sse2; } diff --git a/chromium/third_party/libvpx/source/config/win/ia32/vpx_config.asm b/chromium/third_party/libvpx/source/config/win/ia32/vpx_config.asm index 2509394a18d..c817164acd9 100644 --- a/chromium/third_party/libvpx/source/config/win/ia32/vpx_config.asm +++ b/chromium/third_party/libvpx/source/config/win/ia32/vpx_config.asm @@ -20,7 +20,6 @@ %define HAVE_AVX2 1 %define HAVE_VPX_PORTS 1 %define HAVE_STDINT_H 0 -%define HAVE_ALT_TREE_LAYOUT 0 %define HAVE_PTHREAD_H 0 %define HAVE_SYS_MMAN_H 0 %define HAVE_UNISTD_H 0 diff --git a/chromium/third_party/libvpx/source/config/win/ia32/vpx_config.c b/chromium/third_party/libvpx/source/config/win/ia32/vpx_config.c index b2271e28dac..d88b6d68609 100644 --- a/chromium/third_party/libvpx/source/config/win/ia32/vpx_config.c +++ b/chromium/third_party/libvpx/source/config/win/ia32/vpx_config.c @@ -5,5 +5,6 @@ /* tree. An additional intellectual property rights grant can be found */ /* in the file PATENTS. All contributing project authors may */ /* be found in the AUTHORS file in the root of the source tree. 
*/ +#include "vpx/vpx_codec.h" static const char* const cfg = "--target=x86-win32-vs12 --enable-realtime-only --enable-external-build --enable-postproc --disable-install-srcs --enable-multi-res-encoding --enable-temporal-denoising --disable-unit-tests --disable-install-docs --disable-examples --enable-vp9-temporal-denoising --enable-vp9-postproc --size-limit=16384x16384"; const char *vpx_codec_build_config(void) {return cfg;} diff --git a/chromium/third_party/libvpx/source/config/win/ia32/vpx_config.h b/chromium/third_party/libvpx/source/config/win/ia32/vpx_config.h index fa8997e86c1..e5c65cf5956 100644 --- a/chromium/third_party/libvpx/source/config/win/ia32/vpx_config.h +++ b/chromium/third_party/libvpx/source/config/win/ia32/vpx_config.h @@ -32,7 +32,6 @@ #define HAVE_AVX2 1 #define HAVE_VPX_PORTS 1 #define HAVE_STDINT_H 0 -#define HAVE_ALT_TREE_LAYOUT 0 #define HAVE_PTHREAD_H 0 #define HAVE_SYS_MMAN_H 0 #define HAVE_UNISTD_H 0 diff --git a/chromium/third_party/libvpx/source/config/win/ia32/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/win/ia32/vpx_dsp_rtcd.h index 8cc2fa5b737..32ee77e25ce 100644 --- a/chromium/third_party/libvpx/source/config/win/ia32/vpx_dsp_rtcd.h +++ b/chromium/third_party/libvpx/source/config/win/ia32/vpx_dsp_rtcd.h @@ -18,6 +18,45 @@ extern "C" { #endif +void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +#define vpx_comp_avg_pred vpx_comp_avg_pred_c + +void vpx_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get16x16var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get16x16var_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +RTCD_EXTERN void (*vpx_get16x16var)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); + +unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); +#define vpx_get4x4sse_cs vpx_get4x4sse_cs_c + +void vpx_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get8x8var_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get8x8var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +RTCD_EXTERN void (*vpx_get8x8var)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); + +unsigned int vpx_get_mb_ss_c(const int16_t *); +unsigned int vpx_get_mb_ss_mmx(const int16_t *); +unsigned int vpx_get_mb_ss_sse2(const int16_t *); +RTCD_EXTERN unsigned int (*vpx_get_mb_ss)(const int16_t *); + +unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse16x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); 
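/*
 * Editorial illustration (not part of the commit diff above or below): the
 * vpx_dsp_rtcd.h declarations added in this hunk follow libvpx's run-time
 * CPU detection (RTCD) pattern. Each operation gets a plain-C reference
 * implementation plus optional MMX/SSE2/AVX2 variants, and an RTCD_EXTERN
 * function pointer that setup_rtcd_internal() aims at the best variant the
 * host CPU reports. A minimal sketch of that pattern, using a hypothetical
 * operation my_mse() and a hypothetical capability query cpu_flags() in
 * place of the generated names:
 *
 *   unsigned int my_mse_c(const uint8_t *src, int src_stride,
 *                         const uint8_t *ref, int ref_stride, unsigned int *sse);
 *   unsigned int my_mse_sse2(const uint8_t *src, int src_stride,
 *                            const uint8_t *ref, int ref_stride, unsigned int *sse);
 *   unsigned int my_mse_avx2(const uint8_t *src, int src_stride,
 *                            const uint8_t *ref, int ref_stride, unsigned int *sse);
 *   unsigned int (*my_mse)(const uint8_t *src, int src_stride,
 *                          const uint8_t *ref, int ref_stride, unsigned int *sse);
 *
 *   static void setup_rtcd_internal(void) {
 *       int flags = cpu_flags();         // hypothetical SIMD capability query
 *       my_mse = my_mse_c;               // always start from the C reference
 *       if (flags & HAS_SSE2) my_mse = my_mse_sse2;
 *       if (flags & HAS_AVX2) my_mse = my_mse_avx2;   // last matching flag wins
 *   }
 */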
+RTCD_EXTERN unsigned int (*vpx_mse16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); + +unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse16x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); + +unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse8x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); + +unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse8x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); + unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad16x16_mmx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -251,6 +290,68 @@ void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_p void vpx_sad8x8x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); RTCD_EXTERN void (*vpx_sad8x8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x8_sse2(const uint8_t 
*src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x4_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance4x4)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance4x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x32_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance64x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x64_sse2(const uint8_t 
*src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance64x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x4)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + void vpx_dsp_rtcd(void); #ifdef RTCD_C @@ -261,6 +362,25 @@ static void setup_rtcd_internal(void) (void)flags; + vpx_get16x16var = vpx_get16x16var_c; + if (flags & HAS_SSE2) vpx_get16x16var = vpx_get16x16var_sse2; + if (flags & HAS_AVX2) vpx_get16x16var = vpx_get16x16var_avx2; + vpx_get8x8var = vpx_get8x8var_c; + if (flags & HAS_MMX) vpx_get8x8var = vpx_get8x8var_mmx; + if (flags & HAS_SSE2) vpx_get8x8var = vpx_get8x8var_sse2; + vpx_get_mb_ss = vpx_get_mb_ss_c; + if (flags & HAS_MMX) vpx_get_mb_ss = vpx_get_mb_ss_mmx; + if (flags & HAS_SSE2) vpx_get_mb_ss = vpx_get_mb_ss_sse2; + vpx_mse16x16 = vpx_mse16x16_c; + if (flags & HAS_MMX) vpx_mse16x16 = vpx_mse16x16_mmx; + if (flags & HAS_SSE2) vpx_mse16x16 = vpx_mse16x16_sse2; + if (flags & HAS_AVX2) vpx_mse16x16 = vpx_mse16x16_avx2; + vpx_mse16x8 = vpx_mse16x8_c; + if (flags & HAS_SSE2) vpx_mse16x8 = vpx_mse16x8_sse2; + vpx_mse8x16 = vpx_mse8x16_c; + if (flags & HAS_SSE2) vpx_mse8x16 = vpx_mse8x16_sse2; + vpx_mse8x8 = vpx_mse8x8_c; + if (flags & HAS_SSE2) vpx_mse8x8 = vpx_mse8x8_sse2; vpx_sad16x16 = vpx_sad16x16_c; if (flags & HAS_MMX) vpx_sad16x16 = vpx_sad16x16_mmx; if (flags & HAS_SSE2) vpx_sad16x16 = vpx_sad16x16_sse2; @@ -378,6 +498,42 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSE2) vpx_sad8x8x4d = vpx_sad8x8x4d_sse2; vpx_sad8x8x8 = vpx_sad8x8x8_c; if (flags & HAS_SSE4_1) vpx_sad8x8x8 = vpx_sad8x8x8_sse4_1; + vpx_variance16x16 = vpx_variance16x16_c; + if (flags & HAS_MMX) vpx_variance16x16 = vpx_variance16x16_mmx; + if (flags & HAS_SSE2) vpx_variance16x16 = vpx_variance16x16_sse2; + if (flags & HAS_AVX2) vpx_variance16x16 = 
vpx_variance16x16_avx2; + vpx_variance16x32 = vpx_variance16x32_c; + if (flags & HAS_SSE2) vpx_variance16x32 = vpx_variance16x32_sse2; + vpx_variance16x8 = vpx_variance16x8_c; + if (flags & HAS_MMX) vpx_variance16x8 = vpx_variance16x8_mmx; + if (flags & HAS_SSE2) vpx_variance16x8 = vpx_variance16x8_sse2; + vpx_variance32x16 = vpx_variance32x16_c; + if (flags & HAS_SSE2) vpx_variance32x16 = vpx_variance32x16_sse2; + if (flags & HAS_AVX2) vpx_variance32x16 = vpx_variance32x16_avx2; + vpx_variance32x32 = vpx_variance32x32_c; + if (flags & HAS_SSE2) vpx_variance32x32 = vpx_variance32x32_sse2; + if (flags & HAS_AVX2) vpx_variance32x32 = vpx_variance32x32_avx2; + vpx_variance32x64 = vpx_variance32x64_c; + if (flags & HAS_SSE2) vpx_variance32x64 = vpx_variance32x64_sse2; + vpx_variance4x4 = vpx_variance4x4_c; + if (flags & HAS_MMX) vpx_variance4x4 = vpx_variance4x4_mmx; + if (flags & HAS_SSE2) vpx_variance4x4 = vpx_variance4x4_sse2; + vpx_variance4x8 = vpx_variance4x8_c; + if (flags & HAS_SSE2) vpx_variance4x8 = vpx_variance4x8_sse2; + vpx_variance64x32 = vpx_variance64x32_c; + if (flags & HAS_SSE2) vpx_variance64x32 = vpx_variance64x32_sse2; + if (flags & HAS_AVX2) vpx_variance64x32 = vpx_variance64x32_avx2; + vpx_variance64x64 = vpx_variance64x64_c; + if (flags & HAS_SSE2) vpx_variance64x64 = vpx_variance64x64_sse2; + if (flags & HAS_AVX2) vpx_variance64x64 = vpx_variance64x64_avx2; + vpx_variance8x16 = vpx_variance8x16_c; + if (flags & HAS_MMX) vpx_variance8x16 = vpx_variance8x16_mmx; + if (flags & HAS_SSE2) vpx_variance8x16 = vpx_variance8x16_sse2; + vpx_variance8x4 = vpx_variance8x4_c; + if (flags & HAS_SSE2) vpx_variance8x4 = vpx_variance8x4_sse2; + vpx_variance8x8 = vpx_variance8x8_c; + if (flags & HAS_MMX) vpx_variance8x8 = vpx_variance8x8_mmx; + if (flags & HAS_SSE2) vpx_variance8x8 = vpx_variance8x8_sse2; } #endif diff --git a/chromium/third_party/libvpx/source/config/win/x64/vp8_rtcd.h b/chromium/third_party/libvpx/source/config/win/x64/vp8_rtcd.h index 9989eb09995..5285ac71055 100644 --- a/chromium/third_party/libvpx/source/config/win/x64/vp8_rtcd.h +++ b/chromium/third_party/libvpx/source/config/win/x64/vp8_rtcd.h @@ -74,10 +74,10 @@ void vp8_clear_system_state_c(); void vpx_reset_mmx_state(); #define vp8_clear_system_state vpx_reset_mmx_state -void vp8_copy32xn_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n); -void vp8_copy32xn_sse2(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n); -void vp8_copy32xn_sse3(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n); -RTCD_EXTERN void (*vp8_copy32xn)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n); +void vp8_copy32xn_c(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n); +void vp8_copy32xn_sse2(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n); +void vp8_copy32xn_sse3(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n); +RTCD_EXTERN void (*vp8_copy32xn)(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n); void vp8_copy_mem16x16_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); void vp8_copy_mem16x16_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); @@ -147,15 +147,6 @@ int vp8_full_search_sadx3(struct macroblock 
*x, struct block *b, struct blockd * int vp8_full_search_sadx8(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); RTCD_EXTERN int (*vp8_full_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -unsigned int vp8_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); -unsigned int vp8_get4x4sse_cs_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); -#define vp8_get4x4sse_cs vp8_get4x4sse_cs_mmx - -unsigned int vp8_get_mb_ss_c(const short *); -unsigned int vp8_get_mb_ss_mmx(const short *); -unsigned int vp8_get_mb_ss_sse2(const short *); -#define vp8_get_mb_ss vp8_get_mb_ss_sse2 - void vp8_intra4x4_predict_c(unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left); #define vp8_intra4x4_predict vp8_intra4x4_predict_c @@ -218,11 +209,6 @@ int vp8_mbuverror_mmx(struct macroblock *mb); int vp8_mbuverror_xmm(struct macroblock *mb); #define vp8_mbuverror vp8_mbuverror_xmm -unsigned int vp8_mse16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_mse16x16_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_mse16x16_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_mse16x16 vp8_mse16x16_wmt - void vp8_plane_add_noise_c(unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch); void vp8_plane_add_noise_mmx(unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch); void vp8_plane_add_noise_wmt(unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch); @@ -290,11 +276,6 @@ void vp8_sixtap_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, in void vp8_sixtap_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); RTCD_EXTERN void (*vp8_sixtap_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -unsigned int vp8_sub_pixel_mse16x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); -unsigned int vp8_sub_pixel_mse16x16_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); -unsigned int vp8_sub_pixel_mse16x16_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); -#define vp8_sub_pixel_mse16x16 vp8_sub_pixel_mse16x16_wmt - unsigned int vp8_sub_pixel_variance16x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); unsigned int vp8_sub_pixel_variance16x16_mmx(const unsigned char *src_ptr, int source_stride, int 
xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); unsigned int vp8_sub_pixel_variance16x16_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); @@ -337,31 +318,6 @@ void vp8_subtract_mby_mmx(short *diff, unsigned char *src, int src_stride, unsig void vp8_subtract_mby_sse2(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride); #define vp8_subtract_mby vp8_subtract_mby_sse2 -unsigned int vp8_variance16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance16x16_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance16x16_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_variance16x16 vp8_variance16x16_wmt - -unsigned int vp8_variance16x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance16x8_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance16x8_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_variance16x8 vp8_variance16x8_wmt - -unsigned int vp8_variance4x4_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance4x4_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance4x4_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_variance4x4 vp8_variance4x4_wmt - -unsigned int vp8_variance8x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance8x16_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance8x16_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_variance8x16 vp8_variance8x16_wmt - -unsigned int vp8_variance8x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance8x8_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance8x8_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_variance8x8 vp8_variance8x8_wmt - unsigned int vp8_variance_halfpixvar16x16_h_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); unsigned int vp8_variance_halfpixvar16x16_h_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); unsigned int vp8_variance_halfpixvar16x16_h_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); diff --git 
a/chromium/third_party/libvpx/source/config/win/x64/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/win/x64/vp9_rtcd.h index 10a6b8401db..0834e762f34 100644 --- a/chromium/third_party/libvpx/source/config/win/x64/vp9_rtcd.h +++ b/chromium/third_party/libvpx/source/config/win/x64/vp9_rtcd.h @@ -116,7 +116,8 @@ void vp9_d153_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint RTCD_EXTERN void (*vp9_d153_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vp9_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d153_predictor_32x32 vp9_d153_predictor_32x32_c +void vp9_d153_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_d153_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vp9_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vp9_d153_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); @@ -321,19 +322,6 @@ void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); void vp9_fwht4x4_mmx(const int16_t *input, tran_low_t *output, int stride); #define vp9_fwht4x4 vp9_fwht4x4_mmx -void vp9_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -void vp9_get16x16var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -void vp9_get16x16var_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -RTCD_EXTERN void (*vp9_get16x16var)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); - -void vp9_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -void vp9_get8x8var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -#define vp9_get8x8var vp9_get8x8var_sse2 - -unsigned int vp9_get_mb_ss_c(const int16_t *); -unsigned int vp9_get_mb_ss_sse2(const int16_t *); -#define vp9_get_mb_ss vp9_get_mb_ss_sse2 - void vp9_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vp9_h_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); RTCD_EXTERN void (*vp9_h_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); @@ -488,23 +476,6 @@ void vp9_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *mi void vp9_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); #define vp9_minmax_8x8 vp9_minmax_8x8_sse2 -unsigned int vp9_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vp9_mse16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vp9_mse16x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_mse16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); - -unsigned int 
vp9_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vp9_mse16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -#define vp9_mse16x8 vp9_mse16x8_sse2 - -unsigned int vp9_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vp9_mse8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -#define vp9_mse8x16 vp9_mse8x16_sse2 - -unsigned int vp9_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vp9_mse8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -#define vp9_mse8x8 vp9_mse8x8_sse2 - void vp9_plane_add_noise_c(uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch); void vp9_plane_add_noise_wmt(uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch); #define vp9_plane_add_noise vp9_plane_add_noise_wmt @@ -709,63 +680,6 @@ void vp9_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov void vp9_v_predictor_8x8_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_v_predictor_8x8 vp9_v_predictor_8x8_sse -unsigned int vp9_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance16x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance16x32 vp9_variance16x32_sse2 - -unsigned int vp9_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance16x8 vp9_variance16x8_sse2 - -unsigned int vp9_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance32x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance32x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance32x32_sse2(const uint8_t 
*src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance32x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance32x64 vp9_variance32x64_sse2 - -unsigned int vp9_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance4x4 vp9_variance4x4_sse2 - -unsigned int vp9_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance4x8 vp9_variance4x8_sse2 - -unsigned int vp9_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance64x32_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance64x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance64x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance8x16 vp9_variance8x16_sse2 - -unsigned int vp9_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance8x4 vp9_variance8x4_sse2 - -unsigned int vp9_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance8x8 vp9_variance8x8_sse2 - int vp9_vector_var_c(int16_t const *ref, int16_t const *src, const int bwl); int 
vp9_vector_var_sse2(int16_t const *ref, int16_t const *src, const int bwl); #define vp9_vector_var vp9_vector_var_sse2 @@ -799,6 +713,8 @@ static void setup_rtcd_internal(void) if (flags & HAS_AVX2) vp9_convolve8_vert = vp9_convolve8_vert_avx2; vp9_d153_predictor_16x16 = vp9_d153_predictor_16x16_c; if (flags & HAS_SSSE3) vp9_d153_predictor_16x16 = vp9_d153_predictor_16x16_ssse3; + vp9_d153_predictor_32x32 = vp9_d153_predictor_32x32_c; + if (flags & HAS_SSSE3) vp9_d153_predictor_32x32 = vp9_d153_predictor_32x32_ssse3; vp9_d153_predictor_4x4 = vp9_d153_predictor_4x4_c; if (flags & HAS_SSSE3) vp9_d153_predictor_4x4 = vp9_d153_predictor_4x4_ssse3; vp9_d153_predictor_8x8 = vp9_d153_predictor_8x8_c; @@ -838,8 +754,6 @@ static void setup_rtcd_internal(void) vp9_full_search_sad = vp9_full_search_sad_c; if (flags & HAS_SSE3) vp9_full_search_sad = vp9_full_search_sadx3; if (flags & HAS_SSE4_1) vp9_full_search_sad = vp9_full_search_sadx8; - vp9_get16x16var = vp9_get16x16var_sse2; - if (flags & HAS_AVX2) vp9_get16x16var = vp9_get16x16var_avx2; vp9_h_predictor_16x16 = vp9_h_predictor_16x16_c; if (flags & HAS_SSSE3) vp9_h_predictor_16x16 = vp9_h_predictor_16x16_ssse3; vp9_h_predictor_32x32 = vp9_h_predictor_32x32_c; @@ -856,8 +770,6 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSSE3) vp9_idct8x8_64_add = vp9_idct8x8_64_add_ssse3; vp9_lpf_horizontal_16 = vp9_lpf_horizontal_16_sse2; if (flags & HAS_AVX2) vp9_lpf_horizontal_16 = vp9_lpf_horizontal_16_avx2; - vp9_mse16x16 = vp9_mse16x16_sse2; - if (flags & HAS_AVX2) vp9_mse16x16 = vp9_mse16x16_avx2; vp9_quantize_b = vp9_quantize_b_sse2; if (flags & HAS_SSSE3) vp9_quantize_b = vp9_quantize_b_ssse3; vp9_quantize_b_32x32 = vp9_quantize_b_32x32_c; @@ -922,16 +834,6 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSSE3) vp9_sub_pixel_variance8x4 = vp9_sub_pixel_variance8x4_ssse3; vp9_sub_pixel_variance8x8 = vp9_sub_pixel_variance8x8_sse2; if (flags & HAS_SSSE3) vp9_sub_pixel_variance8x8 = vp9_sub_pixel_variance8x8_ssse3; - vp9_variance16x16 = vp9_variance16x16_sse2; - if (flags & HAS_AVX2) vp9_variance16x16 = vp9_variance16x16_avx2; - vp9_variance32x16 = vp9_variance32x16_sse2; - if (flags & HAS_AVX2) vp9_variance32x16 = vp9_variance32x16_avx2; - vp9_variance32x32 = vp9_variance32x32_sse2; - if (flags & HAS_AVX2) vp9_variance32x32 = vp9_variance32x32_avx2; - vp9_variance64x32 = vp9_variance64x32_sse2; - if (flags & HAS_AVX2) vp9_variance64x32 = vp9_variance64x32_avx2; - vp9_variance64x64 = vp9_variance64x64_sse2; - if (flags & HAS_AVX2) vp9_variance64x64 = vp9_variance64x64_avx2; } #endif diff --git a/chromium/third_party/libvpx/source/config/win/x64/vpx_config.asm b/chromium/third_party/libvpx/source/config/win/x64/vpx_config.asm index b2d9a3776f4..27abdda9cb1 100644 --- a/chromium/third_party/libvpx/source/config/win/x64/vpx_config.asm +++ b/chromium/third_party/libvpx/source/config/win/x64/vpx_config.asm @@ -20,7 +20,6 @@ %define HAVE_AVX2 1 %define HAVE_VPX_PORTS 1 %define HAVE_STDINT_H 0 -%define HAVE_ALT_TREE_LAYOUT 0 %define HAVE_PTHREAD_H 0 %define HAVE_SYS_MMAN_H 0 %define HAVE_UNISTD_H 0 diff --git a/chromium/third_party/libvpx/source/config/win/x64/vpx_config.c b/chromium/third_party/libvpx/source/config/win/x64/vpx_config.c index 67a4c84a4d2..6dd3005d3c5 100644 --- a/chromium/third_party/libvpx/source/config/win/x64/vpx_config.c +++ b/chromium/third_party/libvpx/source/config/win/x64/vpx_config.c @@ -5,5 +5,6 @@ /* tree. An additional intellectual property rights grant can be found */ /* in the file PATENTS. 
All contributing project authors may */ /* be found in the AUTHORS file in the root of the source tree. */ +#include "vpx/vpx_codec.h" static const char* const cfg = "--target=x86_64-win64-vs12 --enable-realtime-only --enable-external-build --enable-postproc --disable-install-srcs --enable-multi-res-encoding --enable-temporal-denoising --disable-unit-tests --disable-install-docs --disable-examples --enable-vp9-temporal-denoising --enable-vp9-postproc --size-limit=16384x16384"; const char *vpx_codec_build_config(void) {return cfg;} diff --git a/chromium/third_party/libvpx/source/config/win/x64/vpx_config.h b/chromium/third_party/libvpx/source/config/win/x64/vpx_config.h index 5e6837cbe9a..d97def150e8 100644 --- a/chromium/third_party/libvpx/source/config/win/x64/vpx_config.h +++ b/chromium/third_party/libvpx/source/config/win/x64/vpx_config.h @@ -32,7 +32,6 @@ #define HAVE_AVX2 1 #define HAVE_VPX_PORTS 1 #define HAVE_STDINT_H 0 -#define HAVE_ALT_TREE_LAYOUT 0 #define HAVE_PTHREAD_H 0 #define HAVE_SYS_MMAN_H 0 #define HAVE_UNISTD_H 0 diff --git a/chromium/third_party/libvpx/source/config/win/x64/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/win/x64/vpx_dsp_rtcd.h index b5df5e08c57..d93c56eb765 100644 --- a/chromium/third_party/libvpx/source/config/win/x64/vpx_dsp_rtcd.h +++ b/chromium/third_party/libvpx/source/config/win/x64/vpx_dsp_rtcd.h @@ -18,6 +18,45 @@ extern "C" { #endif +void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +#define vpx_comp_avg_pred vpx_comp_avg_pred_c + +void vpx_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get16x16var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get16x16var_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +RTCD_EXTERN void (*vpx_get16x16var)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); + +unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); +#define vpx_get4x4sse_cs vpx_get4x4sse_cs_c + +void vpx_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get8x8var_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get8x8var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_get8x8var vpx_get8x8var_sse2 + +unsigned int vpx_get_mb_ss_c(const int16_t *); +unsigned int vpx_get_mb_ss_mmx(const int16_t *); +unsigned int vpx_get_mb_ss_sse2(const int16_t *); +#define vpx_get_mb_ss vpx_get_mb_ss_sse2 + +unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse16x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +RTCD_EXTERN unsigned int 
(*vpx_mse16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); + +unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_mse16x8 vpx_mse16x8_sse2 + +unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_mse8x16 vpx_mse8x16_sse2 + +unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_mse8x8 vpx_mse8x8_sse2 + unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad16x16_mmx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -251,6 +290,68 @@ void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_p void vpx_sad8x8x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); RTCD_EXTERN void (*vpx_sad8x8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance16x32 vpx_variance16x32_sse2 + +unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance16x8 vpx_variance16x8_sse2 + +unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int 
vpx_variance32x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance32x64 vpx_variance32x64_sse2 + +unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x4_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance4x4 vpx_variance4x4_sse2 + +unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance4x8 vpx_variance4x8_sse2 + +unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x32_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance64x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance64x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance8x16 vpx_variance8x16_sse2 
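All of the vpx_variance{W}x{H} kernels declared above share one contract: walk a WxH block of the source and reference frames using their independent strides, write the sum of squared differences through *sse, and return the block variance. As a point of reference only (a plain C loop with an illustrative name, not the optimized libvpx code), an 8x8 version looks like this:

    #include <stdint.h>

    /* Reference 8x8 variance: var = SSE - sum*sum / N, with N = 64 pixels. */
    static unsigned int variance8x8_ref(const uint8_t *src, int src_stride,
                                        const uint8_t *ref, int ref_stride,
                                        unsigned int *sse) {
      int sum = 0;
      unsigned int sq = 0;
      int r, c;
      for (r = 0; r < 8; ++r) {
        for (c = 0; c < 8; ++c) {
          const int d = src[c] - ref[c];
          sum += d;
          sq += (unsigned int)(d * d);
        }
        src += src_stride;
        ref += ref_stride;
      }
      *sse = sq;
      return sq - (unsigned int)(((int64_t)sum * sum) / 64);
    }

The SSE2/AVX2 variants declared above compute the same two accumulators with SIMD; only the dispatch differs.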
+ +unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance8x4 vpx_variance8x4_sse2 + +unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance8x8 vpx_variance8x8_sse2 + void vpx_dsp_rtcd(void); #ifdef RTCD_C @@ -261,6 +362,10 @@ static void setup_rtcd_internal(void) (void)flags; + vpx_get16x16var = vpx_get16x16var_sse2; + if (flags & HAS_AVX2) vpx_get16x16var = vpx_get16x16var_avx2; + vpx_mse16x16 = vpx_mse16x16_sse2; + if (flags & HAS_AVX2) vpx_mse16x16 = vpx_mse16x16_avx2; vpx_sad16x16x3 = vpx_sad16x16x3_c; if (flags & HAS_SSE3) vpx_sad16x16x3 = vpx_sad16x16x3_sse3; if (flags & HAS_SSSE3) vpx_sad16x16x3 = vpx_sad16x16x3_ssse3; @@ -307,6 +412,16 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSE3) vpx_sad8x8x3 = vpx_sad8x8x3_sse3; vpx_sad8x8x8 = vpx_sad8x8x8_c; if (flags & HAS_SSE4_1) vpx_sad8x8x8 = vpx_sad8x8x8_sse4_1; + vpx_variance16x16 = vpx_variance16x16_sse2; + if (flags & HAS_AVX2) vpx_variance16x16 = vpx_variance16x16_avx2; + vpx_variance32x16 = vpx_variance32x16_sse2; + if (flags & HAS_AVX2) vpx_variance32x16 = vpx_variance32x16_avx2; + vpx_variance32x32 = vpx_variance32x32_sse2; + if (flags & HAS_AVX2) vpx_variance32x32 = vpx_variance32x32_avx2; + vpx_variance64x32 = vpx_variance64x32_sse2; + if (flags & HAS_AVX2) vpx_variance64x32 = vpx_variance64x32_avx2; + vpx_variance64x64 = vpx_variance64x64_sse2; + if (flags & HAS_AVX2) vpx_variance64x64 = vpx_variance64x64_avx2; } #endif diff --git a/chromium/third_party/libvpx/source/libvpx/CHANGELOG b/chromium/third_party/libvpx/source/libvpx/CHANGELOG index a318784150a..b0d306442b3 100644 --- a/chromium/third_party/libvpx/source/libvpx/CHANGELOG +++ b/chromium/third_party/libvpx/source/libvpx/CHANGELOG @@ -1,3 +1,8 @@ +xxxx-yy-zz v1.4.0 "Changes for next release" + vpxenc is changed to use VP9 by default. + Encoder controls added for 1 pass SVC. + Decoder control to toggle on/off loopfilter. + 2015-04-03 v1.4.0 "Indian Runner Duck" This release includes significant improvements to the VP9 codec. 
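The setup_rtcd_internal() hunks above show libvpx's run-time CPU detection pattern: a symbol with a single usable specialization is bound at build time with a #define (for example vpx_variance8x8), while a symbol with several SIMD variants is an RTCD_EXTERN function pointer that starts at the baseline (SSE2 on this win/x64 config) and is upgraded once the CPU flags are known. A stripped-down, self-contained sketch of that pattern, with illustrative names and an illustrative flag value (the real flags and setup code live in the generated rtcd headers):

    #include <stdint.h>
    #include <stdio.h>

    #define HAS_AVX2_SKETCH 0x08  /* illustrative bit; not libvpx's value */

    /* Two stand-in specializations of the same operation. */
    static unsigned int my_op_sse2(const uint8_t *p) { return p[0] + 2u; }
    static unsigned int my_op_avx2(const uint8_t *p) { return p[0] + 2u; }

    /* The RTCD-style pointer: callers always go through my_op. */
    static unsigned int (*my_op)(const uint8_t *p);

    static void setup_sketch(int flags) {
      my_op = my_op_sse2;                               /* safe baseline */
      if (flags & HAS_AVX2_SKETCH) my_op = my_op_avx2;  /* upgrade if supported */
    }

    int main(void) {
      const uint8_t px = 40;
      setup_sketch(HAS_AVX2_SKETCH);  /* libvpx derives the flags from CPUID at init */
      printf("%u\n", my_op(&px));     /* prints 42; only the dispatch path differs */
      return 0;
    }

This also matches the vp9_variance*/vp9_mse* removals earlier in the change: those kernels now live in vpx_dsp, so their run-time dispatch moves from vp9_rtcd.h into vpx_dsp_rtcd.h.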
diff --git a/chromium/third_party/libvpx/source/libvpx/README b/chromium/third_party/libvpx/source/libvpx/README index fcd1c2e18cf..7b44ba7a22a 100644 --- a/chromium/third_party/libvpx/source/libvpx/README +++ b/chromium/third_party/libvpx/source/libvpx/README @@ -101,13 +101,6 @@ COMPILING THE APPLICATIONS/LIBRARIES: x86_64-win64-vs10 x86_64-win64-vs11 x86_64-win64-vs12 - universal-darwin8-gcc - universal-darwin9-gcc - universal-darwin10-gcc - universal-darwin11-gcc - universal-darwin12-gcc - universal-darwin13-gcc - universal-darwin14-gcc generic-gnu The generic-gnu target, in conjunction with the CROSS environment variable, diff --git a/chromium/third_party/libvpx/source/libvpx/args.c b/chromium/third_party/libvpx/source/libvpx/args.c index 9dabc9bdd6d..14b031040a4 100644 --- a/chromium/third_party/libvpx/source/libvpx/args.c +++ b/chromium/third_party/libvpx/source/libvpx/args.c @@ -14,9 +14,7 @@ #include <limits.h> #include "args.h" -#ifdef _MSC_VER -#define snprintf _snprintf -#endif +#include "vpx_ports/msvc.h" #if defined(__GNUC__) && __GNUC__ extern void die(const char *fmt, ...) __attribute__((noreturn)); diff --git a/chromium/third_party/libvpx/source/libvpx/build/make/Android.mk b/chromium/third_party/libvpx/source/libvpx/build/make/Android.mk index 0add523f99d..e971c9d1c45 100644 --- a/chromium/third_party/libvpx/source/libvpx/build/make/Android.mk +++ b/chromium/third_party/libvpx/source/libvpx/build/make/Android.mk @@ -163,6 +163,7 @@ ifeq ($(CONFIG_RUNTIME_CPU_DETECT),yes) endif # Add a dependency to force generation of the RTCD files. +define rtcd_dep_template ifeq ($(CONFIG_VP8), yes) $(foreach file, $(LOCAL_SRC_FILES), $(LOCAL_PATH)/$(file)): vp8_rtcd.h endif @@ -175,6 +176,9 @@ $(foreach file, $(LOCAL_SRC_FILES), $(LOCAL_PATH)/$(file)): vpx_dsp_rtcd.h ifeq ($(TARGET_ARCH_ABI),x86) $(foreach file, $(LOCAL_SRC_FILES), $(LOCAL_PATH)/$(file)): vpx_config.asm endif +endef + +$(eval $(call rtcd_dep_template)) .PHONY: clean clean: diff --git a/chromium/third_party/libvpx/source/libvpx/build/make/Makefile b/chromium/third_party/libvpx/source/libvpx/build/make/Makefile index fc7749a5519..f1b1cca33bf 100644 --- a/chromium/third_party/libvpx/source/libvpx/build/make/Makefile +++ b/chromium/third_party/libvpx/source/libvpx/build/make/Makefile @@ -22,8 +22,10 @@ clean:: .DEFAULT exampletest: .DEFAULT install:: .DEFAULT test:: .DEFAULT +test-no-data-check:: .DEFAULT testdata:: .DEFAULT utiltest: .DEFAULT +exampletest-no-data-check utiltest-no-data-check: .DEFAULT # Note: md5sum is not installed on OS X, but openssl is. Openssl may not be @@ -56,13 +58,10 @@ dist: fi endif +# Since we invoke make recursively for multiple targets we need to include the +# .mk file for the correct target, but only when $(target) is non-empty. ifneq ($(target),) -# Normally, we want to build the filename from the target and the toolchain. -# This disambiguates from the $(target).mk file that exists in the source tree. -# However, the toolchain is part of the target in universal builds, so we -# don't want to include TOOLCHAIN in that case. FAT_ARCHS is used to test -# if we're in the universal case. -include $(target)$(if $(FAT_ARCHS),,-$(TOOLCHAIN)).mk +include $(target)-$(TOOLCHAIN).mk endif BUILD_ROOT?=. 
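The args.c hunk above swaps a per-file _MSC_VER workaround for the new vpx_ports/msvc.h header that this change adds to the source lists. The header itself is not shown in this diff; a plausible minimal shim, matching what args.c previously did inline (the guard macro name and the version check are assumptions, not quoted from the real file), would be:

    /* vpx_ports/msvc.h, sketched: older MSVC has no conforming snprintf. */
    #ifndef VPX_PORTS_MSVC_H_SKETCH
    #define VPX_PORTS_MSVC_H_SKETCH

    #ifdef _MSC_VER
    #if _MSC_VER < 1900  /* VS2015 is the first MSVC with a C99-style snprintf */
    #define snprintf _snprintf
    #endif
    #endif

    #endif  /* VPX_PORTS_MSVC_H_SKETCH */

Centralizing the shim is also why the same vpx_ports/msvc.h entry is appended to vpxdec, vpxenc and every example's SRCS list later in this change.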
VPATH=$(SRC_PATH_BARE) @@ -116,6 +115,9 @@ test:: testdata:: .PHONY: utiltest utiltest: +.PHONY: test-no-data-check exampletest-no-data-check utiltest-no-data-check +test-no-data-check:: +exampletest-no-data-check utiltest-no-data-check: # Add compiler flags for intrinsic files ifeq ($(TOOLCHAIN), x86-os2-gcc) @@ -313,18 +315,15 @@ $(1): $$(filter %.o,$$^) $$(extralibs) endef - - -define lipo_lib_template -$(1): $(addsuffix /$(1),$(FAT_ARCHS)) - $(if $(quiet),@echo " [LIPO] $$@") - $(qexec)libtool -static -o $$@ $$? -endef - -define lipo_bin_template -$(1): $(addsuffix /$(1),$(FAT_ARCHS)) - $(if $(quiet),@echo " [LIPO] $$@") - $(qexec)lipo -output $$@ -create $$? +define dll_template +# Not using a pattern rule here because we don't want to generate empty +# archives when they are listed as a dependency in files not responsible +# for creating them. +$(1): + $(if $(quiet),@echo " [LD] $$@") + $(qexec)$$(LD) -Zdll $$(LDFLAGS) \ + -o $$@ \ + $$(filter %.o,$$^) $$(extralibs) $$(EXPORTS_FILE) endef @@ -385,6 +384,7 @@ LIBS=$(call enabled,LIBS) $(foreach lib,$(filter %_g.a,$(LIBS)),$(eval $(call archive_template,$(lib)))) $(foreach lib,$(filter %so.$(SO_VERSION_MAJOR).$(SO_VERSION_MINOR).$(SO_VERSION_PATCH),$(LIBS)),$(eval $(call so_template,$(lib)))) $(foreach lib,$(filter %$(SO_VERSION_MAJOR).dylib,$(LIBS)),$(eval $(call dl_template,$(lib)))) +$(foreach lib,$(filter %$(SO_VERSION_MAJOR).dll,$(LIBS)),$(eval $(call dll_template,$(lib)))) INSTALL-LIBS=$(call cond_enabled,CONFIG_INSTALL_LIBS,INSTALL-LIBS) ifeq ($(MAKECMDGOALS),dist) diff --git a/chromium/third_party/libvpx/source/libvpx/build/make/configure.sh b/chromium/third_party/libvpx/source/libvpx/build/make/configure.sh index 68cc8bb4a5d..688fa12c52a 100644 --- a/chromium/third_party/libvpx/source/libvpx/build/make/configure.sh +++ b/chromium/third_party/libvpx/source/libvpx/build/make/configure.sh @@ -390,7 +390,7 @@ write_common_config_banner() { write_common_config_targets() { for t in ${all_targets}; do if enabled ${t}; then - if enabled universal || enabled child; then + if enabled child; then fwrite config.mk "ALL_TARGETS += ${t}-${toolchain}" else fwrite config.mk "ALL_TARGETS += ${t}" @@ -647,14 +647,6 @@ process_common_toolchain() { # detect tgt_os case "$gcctarget" in - *darwin8*) - tgt_isa=universal - tgt_os=darwin8 - ;; - *darwin9*) - tgt_isa=universal - tgt_os=darwin9 - ;; *darwin10*) tgt_isa=x86_64 tgt_os=darwin10 @@ -736,6 +728,13 @@ process_common_toolchain() { # Handle darwin variants. Newer SDKs allow targeting older # platforms, so use the newest one available. 
case ${toolchain} in + arm*-darwin*) + ios_sdk_dir="$(show_darwin_sdk_path iphoneos)" + if [ -d "${ios_sdk_dir}" ]; then + add_cflags "-isysroot ${ios_sdk_dir}" + add_ldflags "-isysroot ${ios_sdk_dir}" + fi + ;; *-darwin*) osx_sdk_dir="$(show_darwin_sdk_path macosx)" if [ -d "${osx_sdk_dir}" ]; then @@ -811,7 +810,14 @@ process_common_toolchain() { if disabled neon && enabled neon_asm; then die "Disabling neon while keeping neon-asm is not supported" fi - soft_enable media + case ${toolchain} in + *-darwin*) + # Neon is guaranteed on iOS 6+ devices, while old media extensions + # no longer assemble with iOS 9 SDK + ;; + *) + soft_enable media + esac ;; armv6) soft_enable media @@ -1215,7 +1221,7 @@ EOF ;; esac ;; - universal*|*-gcc|generic-gnu) + *-gcc|generic-gnu) link_with_cc=gcc enable_feature gcc setup_gnu_toolchain diff --git a/chromium/third_party/libvpx/source/libvpx/build/make/gen_msvs_vcxproj.sh b/chromium/third_party/libvpx/source/libvpx/build/make/gen_msvs_vcxproj.sh index 643ebd634be..b653651030e 100755 --- a/chromium/third_party/libvpx/source/libvpx/build/make/gen_msvs_vcxproj.sh +++ b/chromium/third_party/libvpx/source/libvpx/build/make/gen_msvs_vcxproj.sh @@ -263,8 +263,8 @@ case "$target" in ;; arm*) platforms[0]="ARM" - asm_Debug_cmdline="armasm -nologo "%(FullPath)"" - asm_Release_cmdline="armasm -nologo "%(FullPath)"" + asm_Debug_cmdline="armasm -nologo -oldit "%(FullPath)"" + asm_Release_cmdline="armasm -nologo -oldit "%(FullPath)"" ;; *) die "Unsupported target $target!" ;; diff --git a/chromium/third_party/libvpx/source/libvpx/configure b/chromium/third_party/libvpx/source/libvpx/configure index 98542855a3e..6cac15aeea5 100755 --- a/chromium/third_party/libvpx/source/libvpx/configure +++ b/chromium/third_party/libvpx/source/libvpx/configure @@ -148,13 +148,6 @@ all_platforms="${all_platforms} x86_64-win64-vs9" all_platforms="${all_platforms} x86_64-win64-vs10" all_platforms="${all_platforms} x86_64-win64-vs11" all_platforms="${all_platforms} x86_64-win64-vs12" -all_platforms="${all_platforms} universal-darwin8-gcc" -all_platforms="${all_platforms} universal-darwin9-gcc" -all_platforms="${all_platforms} universal-darwin10-gcc" -all_platforms="${all_platforms} universal-darwin11-gcc" -all_platforms="${all_platforms} universal-darwin12-gcc" -all_platforms="${all_platforms} universal-darwin13-gcc" -all_platforms="${all_platforms} universal-darwin14-gcc" all_platforms="${all_platforms} generic-gnu" # all_targets is a list of all targets that can be configured @@ -191,6 +184,10 @@ if [ ${doxy_major:-0} -ge 1 ]; then [ $doxy_minor -eq 5 ] && [ $doxy_patch -ge 3 ] && enable_feature doxygen fi +# disable codecs when their source directory does not exist +[ -d "${source_path}/vp8" ] || disable_feature vp8 +[ -d "${source_path}/vp9" ] || disable_feature vp9 + # install everything except the sources, by default. sources will have # to be enabled when doing dist builds, since that's no longer a common # case. @@ -206,31 +203,16 @@ enable_feature multithread enable_feature os_support enable_feature temporal_denoising -[ -d "${source_path}/../include" ] && enable_feature alt_tree_layout -for d in vp8 vp9; do - [ -d "${source_path}/${d}" ] && disable_feature alt_tree_layout; -done - -if ! 
enabled alt_tree_layout; then -# development environment -[ -d "${source_path}/vp8" ] && CODECS="${CODECS} vp8_encoder vp8_decoder" -[ -d "${source_path}/vp9" ] && CODECS="${CODECS} vp9_encoder vp9_decoder" -else -# customer environment -[ -f "${source_path}/../include/vpx/vp8cx.h" ] && CODECS="${CODECS} vp8_encoder" -[ -f "${source_path}/../include/vpx/vp8dx.h" ] && CODECS="${CODECS} vp8_decoder" -[ -f "${source_path}/../include/vpx/vp9cx.h" ] && CODECS="${CODECS} vp9_encoder" -[ -f "${source_path}/../include/vpx/vp9dx.h" ] && CODECS="${CODECS} vp9_decoder" -[ -f "${source_path}/../include/vpx/vp8cx.h" ] || disable_feature vp8_encoder -[ -f "${source_path}/../include/vpx/vp8dx.h" ] || disable_feature vp8_decoder -[ -f "${source_path}/../include/vpx/vp9cx.h" ] || disable_feature vp9_encoder -[ -f "${source_path}/../include/vpx/vp9dx.h" ] || disable_feature vp9_decoder - -[ -f "${source_path}/../lib/*/*mt.lib" ] && soft_enable static_msvcrt -fi - -CODECS="$(echo ${CODECS} | tr ' ' '\n')" -CODEC_FAMILIES="$(for c in ${CODECS}; do echo ${c%_*}; done | sort | uniq)" +CODECS=" + vp8_encoder + vp8_decoder + vp9_encoder + vp9_decoder +" +CODEC_FAMILIES=" + vp8 + vp9 +" ARCH_LIST=" arm @@ -262,7 +244,6 @@ HAVE_LIST=" ${ARCH_EXT_LIST} vpx_ports stdint_h - alt_tree_layout pthread_h sys_mman_h unistd_h @@ -436,22 +417,8 @@ post_process_cmdline() { process_targets() { enabled child || write_common_config_banner - enabled universal || write_common_target_config_h ${BUILD_PFX}vpx_config.h - - # For fat binaries, call configure recursively to configure for each - # binary architecture to be included. - if enabled universal; then - # Call configure (ourselves) for each subarchitecture - for arch in $fat_bin_archs; do - BUILD_PFX=${arch}/ toolchain=${arch} $self --child $cmdline_args || exit $? - done - fi - - # The write_common_config (config.mk) logic is deferred until after the - # recursive calls to configure complete, because we want our universal - # targets to be executed last. + write_common_target_config_h ${BUILD_PFX}vpx_config.h write_common_config_targets - enabled universal && echo "FAT_ARCHS=${fat_bin_archs}" >> config.mk # Calculate the default distribution name, based on the enabled features cf="" @@ -527,11 +494,11 @@ process_detect() { # Can only build shared libs on a subset of platforms. Doing this check # here rather than at option parse time because the target auto-detect # magic happens after the command line has been parsed. - if ! enabled linux; then + if ! enabled linux && ! 
enabled os2; then if enabled gnu; then echo "--enable-shared is only supported on ELF; assuming this is OK" else - die "--enable-shared only supported on ELF for now" + die "--enable-shared only supported on ELF and OS/2 for now" fi fi fi @@ -596,24 +563,6 @@ EOF process_toolchain() { process_common_toolchain - # Handle universal binaries for this architecture - case $toolchain in - universal-darwin*) - darwin_ver=${tgt_os##darwin} - - # Tiger (10.4/darwin8) brought support for x86 - if [ $darwin_ver -ge 8 ]; then - fat_bin_archs="$fat_bin_archs x86-${tgt_os}-${tgt_cc}" - fi - - # Leopard (10.5/darwin9) brought 64 bit support - if [ $darwin_ver -ge 9 ]; then - fat_bin_archs="$fat_bin_archs x86_64-${tgt_os}-${tgt_cc}" - fi - ;; - esac - - # Enable some useful compiler flags if enabled gcc; then enabled werror && check_add_cflags -Werror @@ -701,7 +650,7 @@ process_toolchain() { esac # Other toolchain specific defaults - case $toolchain in x86*|universal*) soft_enable postproc;; esac + case $toolchain in x86*) soft_enable postproc;; esac if enabled postproc_visualizer; then enabled postproc || die "postproc_visualizer requires postproc to be enabled" @@ -765,6 +714,7 @@ CONFIGURE_ARGS="$@" process "$@" print_webm_license ${BUILD_PFX}vpx_config.c "/*" " */" cat <<EOF >> ${BUILD_PFX}vpx_config.c +#include "vpx/vpx_codec.h" static const char* const cfg = "$CONFIGURE_ARGS"; const char *vpx_codec_build_config(void) {return cfg;} EOF diff --git a/chromium/third_party/libvpx/source/libvpx/examples.mk b/chromium/third_party/libvpx/source/libvpx/examples.mk index 4ff1de4eeae..fad02cfcd2b 100644 --- a/chromium/third_party/libvpx/source/libvpx/examples.mk +++ b/chromium/third_party/libvpx/source/libvpx/examples.mk @@ -56,6 +56,7 @@ UTILS-$(CONFIG_DECODERS) += vpxdec.c vpxdec.SRCS += md5_utils.c md5_utils.h vpxdec.SRCS += vpx_ports/mem_ops.h vpxdec.SRCS += vpx_ports/mem_ops_aligned.h +vpxdec.SRCS += vpx_ports/msvc.h vpxdec.SRCS += vpx_ports/vpx_timer.h vpxdec.SRCS += vpx/vpx_integer.h vpxdec.SRCS += args.c args.h @@ -80,6 +81,7 @@ vpxenc.SRCS += tools_common.c tools_common.h vpxenc.SRCS += warnings.c warnings.h vpxenc.SRCS += vpx_ports/mem_ops.h vpxenc.SRCS += vpx_ports/mem_ops_aligned.h +vpxenc.SRCS += vpx_ports/msvc.h vpxenc.SRCS += vpx_ports/vpx_timer.h vpxenc.SRCS += vpxstats.c vpxstats.h ifeq ($(CONFIG_LIBYUV),yes) @@ -98,6 +100,7 @@ ifeq ($(CONFIG_SPATIAL_SVC),yes) vp9_spatial_svc_encoder.SRCS += tools_common.c tools_common.h vp9_spatial_svc_encoder.SRCS += video_common.h vp9_spatial_svc_encoder.SRCS += video_writer.h video_writer.c + vp9_spatial_svc_encoder.SRCS += vpx_ports/msvc.h vp9_spatial_svc_encoder.SRCS += vpxstats.c vpxstats.h vp9_spatial_svc_encoder.GUID = 4A38598D-627D-4505-9C7B-D4020C84100D vp9_spatial_svc_encoder.DESCRIPTION = VP9 Spatial SVC Encoder @@ -112,6 +115,7 @@ vpx_temporal_svc_encoder.SRCS += ivfenc.c ivfenc.h vpx_temporal_svc_encoder.SRCS += tools_common.c tools_common.h vpx_temporal_svc_encoder.SRCS += video_common.h vpx_temporal_svc_encoder.SRCS += video_writer.h video_writer.c +vpx_temporal_svc_encoder.SRCS += vpx_ports/msvc.h vpx_temporal_svc_encoder.GUID = B18C08F2-A439-4502-A78E-849BE3D60947 vpx_temporal_svc_encoder.DESCRIPTION = Temporal SVC Encoder EXAMPLES-$(CONFIG_DECODERS) += simple_decoder.c @@ -122,6 +126,7 @@ simple_decoder.SRCS += video_common.h simple_decoder.SRCS += video_reader.h video_reader.c simple_decoder.SRCS += vpx_ports/mem_ops.h simple_decoder.SRCS += vpx_ports/mem_ops_aligned.h +simple_decoder.SRCS += vpx_ports/msvc.h simple_decoder.DESCRIPTION = 
Simplified decoder loop EXAMPLES-$(CONFIG_DECODERS) += postproc.c postproc.SRCS += ivfdec.h ivfdec.c @@ -130,6 +135,7 @@ postproc.SRCS += video_common.h postproc.SRCS += video_reader.h video_reader.c postproc.SRCS += vpx_ports/mem_ops.h postproc.SRCS += vpx_ports/mem_ops_aligned.h +postproc.SRCS += vpx_ports/msvc.h postproc.GUID = 65E33355-F35E-4088-884D-3FD4905881D7 postproc.DESCRIPTION = Decoder postprocessor control EXAMPLES-$(CONFIG_DECODERS) += decode_to_md5.c @@ -140,6 +146,7 @@ decode_to_md5.SRCS += video_common.h decode_to_md5.SRCS += video_reader.h video_reader.c decode_to_md5.SRCS += vpx_ports/mem_ops.h decode_to_md5.SRCS += vpx_ports/mem_ops_aligned.h +decode_to_md5.SRCS += vpx_ports/msvc.h decode_to_md5.GUID = 59120B9B-2735-4BFE-B022-146CA340FE42 decode_to_md5.DESCRIPTION = Frame by frame MD5 checksum EXAMPLES-$(CONFIG_ENCODERS) += simple_encoder.c @@ -147,6 +154,7 @@ simple_encoder.SRCS += ivfenc.h ivfenc.c simple_encoder.SRCS += tools_common.h tools_common.c simple_encoder.SRCS += video_common.h simple_encoder.SRCS += video_writer.h video_writer.c +simple_encoder.SRCS += vpx_ports/msvc.h simple_encoder.GUID = 4607D299-8A71-4D2C-9B1D-071899B6FBFD simple_encoder.DESCRIPTION = Simplified encoder loop EXAMPLES-$(CONFIG_VP9_ENCODER) += vp9_lossless_encoder.c @@ -154,6 +162,7 @@ vp9_lossless_encoder.SRCS += ivfenc.h ivfenc.c vp9_lossless_encoder.SRCS += tools_common.h tools_common.c vp9_lossless_encoder.SRCS += video_common.h vp9_lossless_encoder.SRCS += video_writer.h video_writer.c +vp9_lossless_encoder.SRCS += vpx_ports/msvc.h vp9_lossless_encoder.GUID = B63C7C88-5348-46DC-A5A6-CC151EF93366 vp9_lossless_encoder.DESCRIPTION = Simplified lossless VP9 encoder EXAMPLES-$(CONFIG_ENCODERS) += twopass_encoder.c @@ -161,6 +170,7 @@ twopass_encoder.SRCS += ivfenc.h ivfenc.c twopass_encoder.SRCS += tools_common.h tools_common.c twopass_encoder.SRCS += video_common.h twopass_encoder.SRCS += video_writer.h video_writer.c +twopass_encoder.SRCS += vpx_ports/msvc.h twopass_encoder.GUID = 73494FA6-4AF9-4763-8FBB-265C92402FD8 twopass_encoder.DESCRIPTION = Two-pass encoder loop EXAMPLES-$(CONFIG_DECODERS) += decode_with_drops.c @@ -170,6 +180,7 @@ decode_with_drops.SRCS += video_common.h decode_with_drops.SRCS += video_reader.h video_reader.c decode_with_drops.SRCS += vpx_ports/mem_ops.h decode_with_drops.SRCS += vpx_ports/mem_ops_aligned.h +decode_with_drops.SRCS += vpx_ports/msvc.h decode_with_drops.GUID = CE5C53C4-8DDA-438A-86ED-0DDD3CDB8D26 decode_with_drops.DESCRIPTION = Drops frames while decoding EXAMPLES-$(CONFIG_ENCODERS) += set_maps.c @@ -177,6 +188,7 @@ set_maps.SRCS += ivfenc.h ivfenc.c set_maps.SRCS += tools_common.h tools_common.c set_maps.SRCS += video_common.h set_maps.SRCS += video_writer.h video_writer.c +set_maps.SRCS += vpx_ports/msvc.h set_maps.GUID = ECB2D24D-98B8-4015-A465-A4AF3DCC145F set_maps.DESCRIPTION = Set active and ROI maps EXAMPLES-$(CONFIG_VP8_ENCODER) += vp8cx_set_ref.c @@ -184,6 +196,7 @@ vp8cx_set_ref.SRCS += ivfenc.h ivfenc.c vp8cx_set_ref.SRCS += tools_common.h tools_common.c vp8cx_set_ref.SRCS += video_common.h vp8cx_set_ref.SRCS += video_writer.h video_writer.c +vp8cx_set_ref.SRCS += vpx_ports/msvc.h vp8cx_set_ref.GUID = C5E31F7F-96F6-48BD-BD3E-10EBF6E8057A vp8cx_set_ref.DESCRIPTION = VP8 set encoder reference frame @@ -194,6 +207,7 @@ EXAMPLES-$(CONFIG_VP8_ENCODER) += vp8_multi_resolution_encoder.c vp8_multi_resolution_encoder.SRCS += ivfenc.h ivfenc.c vp8_multi_resolution_encoder.SRCS += tools_common.h tools_common.c vp8_multi_resolution_encoder.SRCS += 
video_writer.h video_writer.c +vp8_multi_resolution_encoder.SRCS += vpx_ports/msvc.h vp8_multi_resolution_encoder.SRCS += $(LIBYUV_SRCS) vp8_multi_resolution_encoder.GUID = 04f8738e-63c8-423b-90fa-7c2703a374de vp8_multi_resolution_encoder.DESCRIPTION = VP8 Multiple-resolution Encoding @@ -254,14 +268,6 @@ CODEC_EXTRA_LIBS=$(sort $(call enabled,CODEC_EXTRA_LIBS)) $(foreach ex,$(ALL_EXAMPLES),$(eval $(notdir $(ex:.c=)).SRCS += $(ex) examples.mk)) -# If this is a universal (fat) binary, then all the subarchitectures have -# already been built and our job is to stitch them together. The -# BUILD_OBJS variable indicates whether we should be building -# (compiling, linking) the library. The LIPO_OBJS variable indicates -# that we're stitching. -$(eval $(if $(filter universal%,$(TOOLCHAIN)),LIPO_OBJS,BUILD_OBJS):=yes) - - # Create build/install dependencies for all examples. The common case # is handled here. The MSVS case is handled below. NOT_MSVS = $(if $(CONFIG_MSVS),,yes) @@ -269,24 +275,28 @@ DIST-BINS-$(NOT_MSVS) += $(addprefix bin/,$(ALL_EXAMPLES:.c=$(EXE_SFX))) INSTALL-BINS-$(NOT_MSVS) += $(addprefix bin/,$(UTILS:.c=$(EXE_SFX))) DIST-SRCS-yes += $(ALL_SRCS) INSTALL-SRCS-yes += $(UTIL_SRCS) -OBJS-$(NOT_MSVS) += $(if $(BUILD_OBJS),$(call objs,$(ALL_SRCS))) +OBJS-$(NOT_MSVS) += $(call objs,$(ALL_SRCS)) BINS-$(NOT_MSVS) += $(addprefix $(BUILD_PFX),$(ALL_EXAMPLES:.c=$(EXE_SFX))) # Instantiate linker template for all examples. CODEC_LIB=$(if $(CONFIG_DEBUG_LIBS),vpx_g,vpx) -SHARED_LIB_SUF=$(if $(filter darwin%,$(TGT_OS)),.dylib,.so) +ifneq ($(filter darwin%,$(TGT_OS)),) +SHARED_LIB_SUF=.dylib +else +ifneq ($(filter os2%,$(TGT_OS)),) +SHARED_LIB_SUF=_dll.a +else +SHARED_LIB_SUF=.so +endif +endif CODEC_LIB_SUF=$(if $(CONFIG_SHARED),$(SHARED_LIB_SUF),.a) $(foreach bin,$(BINS-yes),\ - $(if $(BUILD_OBJS),$(eval $(bin):\ - $(LIB_PATH)/lib$(CODEC_LIB)$(CODEC_LIB_SUF)))\ - $(if $(BUILD_OBJS),$(eval $(call linker_template,$(bin),\ + $(eval $(bin):$(LIB_PATH)/lib$(CODEC_LIB)$(CODEC_LIB_SUF))\ + $(eval $(call linker_template,$(bin),\ $(call objs,$($(notdir $(bin:$(EXE_SFX)=)).SRCS)) \ -l$(CODEC_LIB) $(addprefix -l,$(CODEC_EXTRA_LIBS))\ - )))\ - $(if $(LIPO_OBJS),$(eval $(call lipo_bin_template,$(bin))))\ - ) - + ))) # The following pairs define a mapping of locations in the distribution # tree to locations in the source/build trees. 
diff --git a/chromium/third_party/libvpx/source/libvpx/examples/decode_to_md5.c b/chromium/third_party/libvpx/source/libvpx/examples/decode_to_md5.c index a3843bed336..1ae7a4b57f5 100644 --- a/chromium/third_party/libvpx/source/libvpx/examples/decode_to_md5.c +++ b/chromium/third_party/libvpx/source/libvpx/examples/decode_to_md5.c @@ -71,7 +71,7 @@ static void print_md5(FILE *stream, unsigned char digest[16]) { static const char *exec_name; -void usage_exit() { +void usage_exit(void) { fprintf(stderr, "Usage: %s <infile> <outfile>\n", exec_name); exit(EXIT_FAILURE); } diff --git a/chromium/third_party/libvpx/source/libvpx/examples/decode_with_drops.c b/chromium/third_party/libvpx/source/libvpx/examples/decode_with_drops.c index 36f7d80e127..2233e473d36 100644 --- a/chromium/third_party/libvpx/source/libvpx/examples/decode_with_drops.c +++ b/chromium/third_party/libvpx/source/libvpx/examples/decode_with_drops.c @@ -65,7 +65,7 @@ static const char *exec_name; -void usage_exit() { +void usage_exit(void) { fprintf(stderr, "Usage: %s <infile> <outfile> <N-M|N/M>\n", exec_name); exit(EXIT_FAILURE); } diff --git a/chromium/third_party/libvpx/source/libvpx/examples/postproc.c b/chromium/third_party/libvpx/source/libvpx/examples/postproc.c index e34426a6194..a8ac208d9bd 100644 --- a/chromium/third_party/libvpx/source/libvpx/examples/postproc.c +++ b/chromium/third_party/libvpx/source/libvpx/examples/postproc.c @@ -52,7 +52,7 @@ static const char *exec_name; -void usage_exit() { +void usage_exit(void) { fprintf(stderr, "Usage: %s <infile> <outfile>\n", exec_name); exit(EXIT_FAILURE); } diff --git a/chromium/third_party/libvpx/source/libvpx/examples/resize_util.c b/chromium/third_party/libvpx/source/libvpx/examples/resize_util.c index f8c35255fa2..e6fdd5bb2af 100644 --- a/chromium/third_party/libvpx/source/libvpx/examples/resize_util.c +++ b/chromium/third_party/libvpx/source/libvpx/examples/resize_util.c @@ -15,6 +15,7 @@ #include <stdlib.h> #include <string.h> +#include "../tools_common.h" #include "../vp9/encoder/vp9_resize.h" static const char *exec_name = NULL; @@ -26,7 +27,7 @@ static void usage() { printf("<output_yuv> [<frames>]\n"); } -void usage_exit() { +void usage_exit(void) { usage(); exit(EXIT_FAILURE); } diff --git a/chromium/third_party/libvpx/source/libvpx/examples/set_maps.c b/chromium/third_party/libvpx/source/libvpx/examples/set_maps.c index 5555baac22e..1dc3ac0c98f 100644 --- a/chromium/third_party/libvpx/source/libvpx/examples/set_maps.c +++ b/chromium/third_party/libvpx/source/libvpx/examples/set_maps.c @@ -55,7 +55,7 @@ static const char *exec_name; -void usage_exit() { +void usage_exit(void) { fprintf(stderr, "Usage: %s <codec> <width> <height> <infile> <outfile>\n", exec_name); exit(EXIT_FAILURE); diff --git a/chromium/third_party/libvpx/source/libvpx/examples/simple_decoder.c b/chromium/third_party/libvpx/source/libvpx/examples/simple_decoder.c index 08a21668542..8ccc81035e3 100644 --- a/chromium/third_party/libvpx/source/libvpx/examples/simple_decoder.c +++ b/chromium/third_party/libvpx/source/libvpx/examples/simple_decoder.c @@ -88,7 +88,7 @@ static const char *exec_name; -void usage_exit() { +void usage_exit(void) { fprintf(stderr, "Usage: %s <infile> <outfile>\n", exec_name); exit(EXIT_FAILURE); } diff --git a/chromium/third_party/libvpx/source/libvpx/examples/simple_encoder.c b/chromium/third_party/libvpx/source/libvpx/examples/simple_encoder.c index e805c258747..a3077297318 100644 --- a/chromium/third_party/libvpx/source/libvpx/examples/simple_encoder.c +++ 
b/chromium/third_party/libvpx/source/libvpx/examples/simple_encoder.c @@ -106,7 +106,7 @@ static const char *exec_name; -void usage_exit() { +void usage_exit(void) { fprintf(stderr, "Usage: %s <codec> <width> <height> <infile> <outfile> " "<keyframe-interval> [<error-resilient>]\nSee comments in " diff --git a/chromium/third_party/libvpx/source/libvpx/examples/twopass_encoder.c b/chromium/third_party/libvpx/source/libvpx/examples/twopass_encoder.c index 0ec83ddccdf..aecc11d3f4e 100644 --- a/chromium/third_party/libvpx/source/libvpx/examples/twopass_encoder.c +++ b/chromium/third_party/libvpx/source/libvpx/examples/twopass_encoder.c @@ -58,7 +58,7 @@ static const char *exec_name; -void usage_exit() { +void usage_exit(void) { fprintf(stderr, "Usage: %s <codec> <width> <height> <infile> <outfile>\n", exec_name); exit(EXIT_FAILURE); diff --git a/chromium/third_party/libvpx/source/libvpx/examples/vp8_multi_resolution_encoder.c b/chromium/third_party/libvpx/source/libvpx/examples/vp8_multi_resolution_encoder.c index e623567b8fe..2b032049c0b 100644 --- a/chromium/third_party/libvpx/source/libvpx/examples/vp8_multi_resolution_encoder.c +++ b/chromium/third_party/libvpx/source/libvpx/examples/vp8_multi_resolution_encoder.c @@ -37,15 +37,14 @@ #include <unistd.h> #endif #include "vpx_ports/vpx_timer.h" -#define VPX_CODEC_DISABLE_COMPAT 1 #include "vpx/vpx_encoder.h" #include "vpx/vp8cx.h" #include "vpx_ports/mem_ops.h" -#include "./tools_common.h" +#include "../tools_common.h" #define interface (vpx_codec_vp8_cx()) #define fourcc 0x30385056 -void usage_exit() { +void usage_exit(void) { exit(EXIT_FAILURE); } diff --git a/chromium/third_party/libvpx/source/libvpx/examples/vp8cx_set_ref.c b/chromium/third_party/libvpx/source/libvpx/examples/vp8cx_set_ref.c index a2982821a42..8b4cc303d3c 100644 --- a/chromium/third_party/libvpx/source/libvpx/examples/vp8cx_set_ref.c +++ b/chromium/third_party/libvpx/source/libvpx/examples/vp8cx_set_ref.c @@ -58,7 +58,7 @@ static const char *exec_name; -void usage_exit() { +void usage_exit(void) { fprintf(stderr, "Usage: %s <width> <height> <infile> <outfile> <frame>\n", exec_name); exit(EXIT_FAILURE); diff --git a/chromium/third_party/libvpx/source/libvpx/examples/vp9_lossless_encoder.c b/chromium/third_party/libvpx/source/libvpx/examples/vp9_lossless_encoder.c index 54275770d5f..82725168304 100644 --- a/chromium/third_party/libvpx/source/libvpx/examples/vp9_lossless_encoder.c +++ b/chromium/third_party/libvpx/source/libvpx/examples/vp9_lossless_encoder.c @@ -20,7 +20,7 @@ static const char *exec_name; -void usage_exit() { +void usage_exit(void) { fprintf(stderr, "vp9_lossless_encoder: Example demonstrating VP9 lossless " "encoding feature. Supports raw input only.\n"); fprintf(stderr, "Usage: %s <width> <height> <infile> <outfile>\n", exec_name); diff --git a/chromium/third_party/libvpx/source/libvpx/examples/vp9_spatial_svc_encoder.c b/chromium/third_party/libvpx/source/libvpx/examples/vp9_spatial_svc_encoder.c index f4deb693b2f..5a609766561 100644 --- a/chromium/third_party/libvpx/source/libvpx/examples/vp9_spatial_svc_encoder.c +++ b/chromium/third_party/libvpx/source/libvpx/examples/vp9_spatial_svc_encoder.c @@ -14,11 +14,13 @@ * that benefit from a scalable bitstream. 
*/ +#include <math.h> #include <stdarg.h> #include <stdlib.h> #include <string.h> #include <time.h> + #include "../args.h" #include "../tools_common.h" #include "../video_writer.h" @@ -27,11 +29,18 @@ #include "vpx/vp8cx.h" #include "vpx/vpx_encoder.h" #include "../vpxstats.h" +#define OUTPUT_RC_STATS 1 static const arg_def_t skip_frames_arg = ARG_DEF("s", "skip-frames", 1, "input frames to skip"); static const arg_def_t frames_arg = ARG_DEF("f", "frames", 1, "number of frames to encode"); +static const arg_def_t threads_arg = + ARG_DEF("th", "threads", 1, "number of threads to use"); +#if OUTPUT_RC_STATS +static const arg_def_t output_rc_stats_arg = + ARG_DEF("rcstat", "output_rc_stats", 1, "output rc stats"); +#endif static const arg_def_t width_arg = ARG_DEF("w", "width", 1, "source width"); static const arg_def_t height_arg = ARG_DEF("h", "height", 1, "source height"); static const arg_def_t timebase_arg = @@ -42,6 +51,9 @@ static const arg_def_t spatial_layers_arg = ARG_DEF("sl", "spatial-layers", 1, "number of spatial SVC layers"); static const arg_def_t temporal_layers_arg = ARG_DEF("tl", "temporal-layers", 1, "number of temporal SVC layers"); +static const arg_def_t temporal_layering_mode_arg = + ARG_DEF("tlm", "temporal-layering-mode", 1, "temporal layering scheme." + "VP9E_TEMPORAL_LAYERING_MODE"); static const arg_def_t kf_dist_arg = ARG_DEF("k", "kf-dist", 1, "number of frames between keyframes"); static const arg_def_t scale_factors_arg = @@ -65,6 +77,8 @@ static const arg_def_t lag_in_frame_arg = "generating any outputs"); static const arg_def_t rc_end_usage_arg = ARG_DEF(NULL, "rc-end-usage", 1, "0 - 3: VBR, CBR, CQ, Q"); +static const arg_def_t speed_arg = + ARG_DEF("sp", "speed", 1, "speed configuration"); #if CONFIG_VP9_HIGHBITDEPTH static const struct arg_enum_list bitdepth_enum[] = { @@ -85,10 +99,16 @@ static const arg_def_t *svc_args[] = { &timebase_arg, &bitrate_arg, &skip_frames_arg, &spatial_layers_arg, &kf_dist_arg, &scale_factors_arg, &passes_arg, &pass_arg, &fpf_name_arg, &min_q_arg, &max_q_arg, &min_bitrate_arg, - &max_bitrate_arg, &temporal_layers_arg, &lag_in_frame_arg, + &max_bitrate_arg, &temporal_layers_arg, &temporal_layering_mode_arg, + &lag_in_frame_arg, &threads_arg, +#if OUTPUT_RC_STATS + &output_rc_stats_arg, +#endif + #if CONFIG_VP9_HIGHBITDEPTH &bitdepth_arg, #endif + &speed_arg, &rc_end_usage_arg, NULL }; @@ -102,6 +122,10 @@ static const uint32_t default_bitrate = 1000; static const uint32_t default_spatial_layers = 5; static const uint32_t default_temporal_layers = 1; static const uint32_t default_kf_dist = 100; +static const uint32_t default_temporal_layering_mode = 0; +static const uint32_t default_output_rc_stats = 0; +static const int32_t default_speed = -1; // -1 means use library default. +static const uint32_t default_threads = 0; // zero means use library default. 
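A change repeated across the example sources above (decode_to_md5.c, decode_with_drops.c, postproc.c, resize_util.c, set_maps.c, simple_decoder.c, simple_encoder.c, twopass_encoder.c, the VP8 examples and vp9_lossless_encoder.c) is the switch from usage_exit() to usage_exit(void). In C an empty parameter list declares a function with unspecified arguments rather than a true prototype, so only the (void) form lets the compiler reject stray arguments and keeps -Wstrict-prototypes builds quiet. A two-line illustration with hypothetical names:

    void usage_exit_old();      /* not a prototype: usage_exit_old(42) still compiles */
    void usage_exit_new(void);  /* real prototype: usage_exit_new(42) is a hard error */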
typedef struct { const char *input_filename; @@ -116,7 +140,7 @@ typedef struct { static const char *exec_name; -void usage_exit() { +void usage_exit(void) { fprintf(stderr, "Usage: %s <options> input_filename output_filename\n", exec_name); fprintf(stderr, "Options:\n"); @@ -143,6 +167,12 @@ static void parse_command_line(int argc, const char **argv_, svc_ctx->log_level = SVC_LOG_DEBUG; svc_ctx->spatial_layers = default_spatial_layers; svc_ctx->temporal_layers = default_temporal_layers; + svc_ctx->temporal_layering_mode = default_temporal_layering_mode; +#if OUTPUT_RC_STATS + svc_ctx->output_rc_stat = default_output_rc_stats; +#endif + svc_ctx->speed = default_speed; + svc_ctx->threads = default_threads; // start with default encoder configuration res = vpx_codec_enc_config_default(vpx_codec_vp9_cx(), enc_cfg, 0); @@ -184,6 +214,20 @@ static void parse_command_line(int argc, const char **argv_, svc_ctx->spatial_layers = arg_parse_uint(&arg); } else if (arg_match(&arg, &temporal_layers_arg, argi)) { svc_ctx->temporal_layers = arg_parse_uint(&arg); +#if OUTPUT_RC_STATS + } else if (arg_match(&arg, &output_rc_stats_arg, argi)) { + svc_ctx->output_rc_stat = arg_parse_uint(&arg); +#endif + } else if (arg_match(&arg, &speed_arg, argi)) { + svc_ctx->speed = arg_parse_uint(&arg); + } else if (arg_match(&arg, &threads_arg, argi)) { + svc_ctx->threads = arg_parse_uint(&arg); + } else if (arg_match(&arg, &temporal_layering_mode_arg, argi)) { + svc_ctx->temporal_layering_mode = + enc_cfg->temporal_layering_mode = arg_parse_int(&arg); + if (svc_ctx->temporal_layering_mode) { + enc_cfg->g_error_resilient = 1; + } } else if (arg_match(&arg, &kf_dist_arg, argi)) { enc_cfg->kf_min_dist = arg_parse_uint(&arg); enc_cfg->kf_max_dist = enc_cfg->kf_min_dist; @@ -316,6 +360,185 @@ static void parse_command_line(int argc, const char **argv_, enc_cfg->rc_target_bitrate, enc_cfg->kf_max_dist); } +#if OUTPUT_RC_STATS +// For rate control encoding stats. +struct RateControlStats { + // Number of input frames per layer. + int layer_input_frames[VPX_MAX_LAYERS]; + // Total (cumulative) number of encoded frames per layer. + int layer_tot_enc_frames[VPX_MAX_LAYERS]; + // Number of encoded non-key frames per layer. + int layer_enc_frames[VPX_MAX_LAYERS]; + // Framerate per layer (cumulative). + double layer_framerate[VPX_MAX_LAYERS]; + // Target average frame size per layer (per-frame-bandwidth per layer). + double layer_pfb[VPX_MAX_LAYERS]; + // Actual average frame size per layer. + double layer_avg_frame_size[VPX_MAX_LAYERS]; + // Average rate mismatch per layer (|target - actual| / target). + double layer_avg_rate_mismatch[VPX_MAX_LAYERS]; + // Actual encoding bitrate per layer (cumulative). + double layer_encoding_bitrate[VPX_MAX_LAYERS]; + // Average of the short-time encoder actual bitrate. + // TODO(marpan): Should we add these short-time stats for each layer? + double avg_st_encoding_bitrate; + // Variance of the short-time encoder actual bitrate. + double variance_st_encoding_bitrate; + // Window (number of frames) for computing short-time encoding bitrate. + int window_size; + // Number of window measurements. + int window_count; +}; + +// Note: these rate control stats assume only 1 key frame in the +// sequence (i.e., first frame only). +static void set_rate_control_stats(struct RateControlStats *rc, + vpx_codec_enc_cfg_t *cfg) { + unsigned int sl, tl; + // Set the layer (cumulative) framerate and the target layer (non-cumulative) + // per-frame-bandwidth, for the rate control encoding stats below. 
+ const double framerate = cfg->g_timebase.den / cfg->g_timebase.num; + + for (sl = 0; sl < cfg->ss_number_layers; ++sl) { + for (tl = 0; tl < cfg->ts_number_layers; ++tl) { + const int layer = sl * cfg->ts_number_layers + tl; + const int tlayer0 = sl * cfg->ts_number_layers; + rc->layer_framerate[layer] = + framerate / cfg->ts_rate_decimator[tl]; + if (tl > 0) { + rc->layer_pfb[layer] = 1000.0 * + (cfg->layer_target_bitrate[layer] - + cfg->layer_target_bitrate[layer - 1]) / + (rc->layer_framerate[layer] - + rc->layer_framerate[layer - 1]); + } else { + rc->layer_pfb[tlayer0] = 1000.0 * + cfg->layer_target_bitrate[tlayer0] / + rc->layer_framerate[tlayer0]; + } + rc->layer_input_frames[layer] = 0; + rc->layer_enc_frames[layer] = 0; + rc->layer_tot_enc_frames[layer] = 0; + rc->layer_encoding_bitrate[layer] = 0.0; + rc->layer_avg_frame_size[layer] = 0.0; + rc->layer_avg_rate_mismatch[layer] = 0.0; + } + } + rc->window_count = 0; + rc->window_size = 15; + rc->avg_st_encoding_bitrate = 0.0; + rc->variance_st_encoding_bitrate = 0.0; +} + +static void printout_rate_control_summary(struct RateControlStats *rc, + vpx_codec_enc_cfg_t *cfg, + int frame_cnt) { + unsigned int sl, tl; + int tot_num_frames = 0; + double perc_fluctuation = 0.0; + printf("Total number of processed frames: %d\n\n", frame_cnt - 1); + printf("Rate control layer stats for sl%d tl%d layer(s):\n\n", + cfg->ss_number_layers, cfg->ts_number_layers); + for (sl = 0; sl < cfg->ss_number_layers; ++sl) { + for (tl = 0; tl < cfg->ts_number_layers; ++tl) { + const int layer = sl * cfg->ts_number_layers + tl; + const int num_dropped = (tl > 0) ? + (rc->layer_input_frames[layer] - rc->layer_enc_frames[layer]) : + (rc->layer_input_frames[layer] - rc->layer_enc_frames[layer] - 1); + if (!sl) + tot_num_frames += rc->layer_input_frames[layer]; + rc->layer_encoding_bitrate[layer] = 0.001 * rc->layer_framerate[layer] * + rc->layer_encoding_bitrate[layer] / tot_num_frames; + rc->layer_avg_frame_size[layer] = rc->layer_avg_frame_size[layer] / + rc->layer_enc_frames[layer]; + rc->layer_avg_rate_mismatch[layer] = + 100.0 * rc->layer_avg_rate_mismatch[layer] / + rc->layer_enc_frames[layer]; + printf("For layer#: sl%d tl%d \n", sl, tl); + printf("Bitrate (target vs actual): %d %f.0 kbps\n", + cfg->layer_target_bitrate[layer], + rc->layer_encoding_bitrate[layer]); + printf("Average frame size (target vs actual): %f %f bits\n", + rc->layer_pfb[layer], rc->layer_avg_frame_size[layer]); + printf("Average rate_mismatch: %f\n", + rc->layer_avg_rate_mismatch[layer]); + printf("Number of input frames, encoded (non-key) frames, " + "and percent dropped frames: %d %d %f.0 \n", + rc->layer_input_frames[layer], rc->layer_enc_frames[layer], + 100.0 * num_dropped / rc->layer_input_frames[layer]); + printf("\n"); + } + } + rc->avg_st_encoding_bitrate = rc->avg_st_encoding_bitrate / rc->window_count; + rc->variance_st_encoding_bitrate = + rc->variance_st_encoding_bitrate / rc->window_count - + (rc->avg_st_encoding_bitrate * rc->avg_st_encoding_bitrate); + perc_fluctuation = 100.0 * sqrt(rc->variance_st_encoding_bitrate) / + rc->avg_st_encoding_bitrate; + printf("Short-time stats, for window of %d frames: \n", rc->window_size); + printf("Average, rms-variance, and percent-fluct: %f %f %f \n", + rc->avg_st_encoding_bitrate, + sqrt(rc->variance_st_encoding_bitrate), + perc_fluctuation); + if (frame_cnt != tot_num_frames) + die("Error: Number of input frames not equal to output encoded frames != " + "%d tot_num_frames = %d\n", frame_cnt, tot_num_frames); +} + 
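printout_rate_control_summary() above reports the short-time bitrate statistics with the usual one-pass shortcut var(X) = E[X^2] - (E[X])^2 over the per-window bitrates, then expresses the spread as 100 * sqrt(var) / avg percent. A small self-contained sketch of that accumulation with made-up window bitrates (link with -lm):

    #include <math.h>
    #include <stdio.h>

    int main(void) {
      /* Hypothetical per-window encoding bitrates, in kbps. */
      const double window_kbps[] = { 480.0, 520.0, 505.0, 495.0 };
      const int n = (int)(sizeof(window_kbps) / sizeof(window_kbps[0]));
      double sum = 0.0, sum_sq = 0.0;
      int i;
      for (i = 0; i < n; ++i) {
        sum += window_kbps[i];
        sum_sq += window_kbps[i] * window_kbps[i];
      }
      {
        const double avg = sum / n;
        const double var = sum_sq / n - avg * avg;     /* E[X^2] - (E[X])^2 */
        const double fluct = 100.0 * sqrt(var) / avg;  /* percent fluctuation */
        printf("avg %.1f kbps, rms deviation %.1f kbps, fluct %.2f%%\n",
               avg, sqrt(var), fluct);
      }
      return 0;
    }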
+vpx_codec_err_t parse_superframe_index(const uint8_t *data, + size_t data_sz, + uint32_t sizes[8], int *count) { + // A chunk ending with a byte matching 0xc0 is an invalid chunk unless + // it is a super frame index. If the last byte of real video compression + // data is 0xc0 the encoder must add a 0 byte. If we have the marker but + // not the associated matching marker byte at the front of the index we have + // an invalid bitstream and need to return an error. + + uint8_t marker; + + marker = *(data + data_sz - 1); + *count = 0; + + + if ((marker & 0xe0) == 0xc0) { + const uint32_t frames = (marker & 0x7) + 1; + const uint32_t mag = ((marker >> 3) & 0x3) + 1; + const size_t index_sz = 2 + mag * frames; + + // This chunk is marked as having a superframe index but doesn't have + // enough data for it, thus it's an invalid superframe index. + if (data_sz < index_sz) + return VPX_CODEC_CORRUPT_FRAME; + + { + const uint8_t marker2 = *(data + data_sz - index_sz); + + // This chunk is marked as having a superframe index but doesn't have + // the matching marker byte at the front of the index therefore it's an + // invalid chunk. + if (marker != marker2) + return VPX_CODEC_CORRUPT_FRAME; + } + + { + // Found a valid superframe index. + uint32_t i, j; + const uint8_t *x = &data[data_sz - index_sz + 1]; + + for (i = 0; i < frames; ++i) { + uint32_t this_sz = 0; + + for (j = 0; j < mag; ++j) + this_sz |= (*x++) << (j * 8); + sizes[i] = this_sz; + } + *count = frames; + } + } + return VPX_CODEC_OK; +} +#endif + int main(int argc, const char **argv) { AppInput app_input = {0}; VpxVideoWriter *writer = NULL; @@ -332,7 +555,15 @@ int main(int argc, const char **argv) { FILE *infile = NULL; int end_of_stream = 0; int frames_received = 0; - +#if OUTPUT_RC_STATS + VpxVideoWriter *outfile[VPX_TS_MAX_LAYERS] = {NULL}; + struct RateControlStats rc; + vpx_svc_layer_id_t layer_id; + int sl, tl; + double sum_bitrate = 0.0; + double sum_bitrate2 = 0.0; + double framerate = 30.0; +#endif memset(&svc_ctx, 0, sizeof(svc_ctx)); svc_ctx.log_print = 1; exec_name = argv[0]; @@ -359,6 +590,13 @@ int main(int argc, const char **argv) { VPX_CODEC_OK) die("Failed to initialize encoder\n"); +#if OUTPUT_RC_STATS + if (svc_ctx.output_rc_stat) { + set_rate_control_stats(&rc, &enc_cfg); + framerate = enc_cfg.g_timebase.den / enc_cfg.g_timebase.num; + } +#endif + info.codec_fourcc = VP9_FOURCC; info.time_base.numerator = enc_cfg.g_timebase.num; info.time_base.denominator = enc_cfg.g_timebase.den; @@ -370,11 +608,31 @@ int main(int argc, const char **argv) { if (!writer) die("Failed to open %s for writing\n", app_input.output_filename); } +#if OUTPUT_RC_STATS + // For now, just write temporal layer streams. + // TODO(wonkap): do spatial by re-writing superframe. 
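parse_superframe_index() earlier in this file reads the VP9 superframe trailer: the final byte is a marker whose top three bits are 110, the low three bits give the frame count minus one, bits 3..4 give the per-size byte width minus one, and the same marker byte must also open the index. A worked example with made-up frame sizes:

    /* Superframe trailer for 2 sub-frames whose sizes fit in 2 bytes each:
     *   marker   = 0xC0 | ((mag - 1) << 3) | (frames - 1)
     *            = 0xC0 | (1 << 3) | 1            = 0xC9
     *   index_sz = 2 + mag * frames = 2 + 2 * 2   = 6 bytes
     * With hypothetical sizes 0x0123 and 0x0456, the chunk ends with:
     *   C9  23 01  56 04  C9                      (sizes are little endian)
     * and the parser returns sizes[0] = 0x123, sizes[1] = 0x456, *count = 2.
     * If the leading and trailing marker bytes differ, or the chunk is shorter
     * than index_sz, it is rejected as VPX_CODEC_CORRUPT_FRAME.
     */

The tool only needs the per-frame sizes so that the rate-control stats below can attribute bytes to each spatial layer of the superframe.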
+ if (svc_ctx.output_rc_stat) { + for (tl = 0; tl < enc_cfg.ts_number_layers; ++tl) { + char file_name[PATH_MAX]; + + snprintf(file_name, sizeof(file_name), "%s_t%d.ivf", + app_input.output_filename, tl); + outfile[tl] = vpx_video_writer_open(file_name, kContainerIVF, &info); + if (!outfile[tl]) + die("Failed to open %s for writing", file_name); + } + } +#endif // skip initial frames for (i = 0; i < app_input.frames_to_skip; ++i) vpx_img_read(&raw, infile); + if (svc_ctx.speed != -1) + vpx_codec_control(&codec, VP8E_SET_CPUUSED, svc_ctx.speed); + if (svc_ctx.threads) + vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, (svc_ctx.threads >> 1)); + // Encode frames while (!end_of_stream) { vpx_codec_iter_t iter = NULL; @@ -386,7 +644,9 @@ int main(int argc, const char **argv) { } res = vpx_svc_encode(&svc_ctx, &codec, (end_of_stream ? NULL : &raw), - pts, frame_duration, VPX_DL_GOOD_QUALITY); + pts, frame_duration, svc_ctx.speed >= 5 ? + VPX_DL_REALTIME : VPX_DL_GOOD_QUALITY); + printf("%s", vpx_svc_get_message(&svc_ctx)); if (res != VPX_CODEC_OK) { die_codec(&codec, "Failed to encode frame"); @@ -395,11 +655,90 @@ int main(int argc, const char **argv) { while ((cx_pkt = vpx_codec_get_cx_data(&codec, &iter)) != NULL) { switch (cx_pkt->kind) { case VPX_CODEC_CX_FRAME_PKT: { - if (cx_pkt->data.frame.sz > 0) + if (cx_pkt->data.frame.sz > 0) { +#if OUTPUT_RC_STATS + uint32_t sizes[8]; + int count = 0; +#endif vpx_video_writer_write_frame(writer, cx_pkt->data.frame.buf, cx_pkt->data.frame.sz, cx_pkt->data.frame.pts); +#if OUTPUT_RC_STATS + // TODO(marpan/wonkap): Put this (to line728) in separate function. + if (svc_ctx.output_rc_stat) { + vpx_codec_control(&codec, VP9E_GET_SVC_LAYER_ID, &layer_id); + parse_superframe_index(cx_pkt->data.frame.buf, + cx_pkt->data.frame.sz, sizes, &count); + for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) { + ++rc.layer_input_frames[sl * enc_cfg.ts_number_layers + + layer_id.temporal_layer_id]; + } + for (tl = layer_id.temporal_layer_id; + tl < enc_cfg.ts_number_layers; ++tl) { + vpx_video_writer_write_frame(outfile[tl], + cx_pkt->data.frame.buf, + cx_pkt->data.frame.sz, + cx_pkt->data.frame.pts); + } + + for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) { + for (tl = layer_id.temporal_layer_id; + tl < enc_cfg.ts_number_layers; ++tl) { + const int layer = sl * enc_cfg.ts_number_layers + tl; + ++rc.layer_tot_enc_frames[layer]; + rc.layer_encoding_bitrate[layer] += 8.0 * sizes[sl]; + // Keep count of rate control stats per layer, for non-key + // frames. + if (tl == layer_id.temporal_layer_id && + !(cx_pkt->data.frame.flags & VPX_FRAME_IS_KEY)) { + rc.layer_avg_frame_size[layer] += 8.0 * sizes[sl]; + rc.layer_avg_rate_mismatch[layer] += + fabs(8.0 * sizes[sl] - rc.layer_pfb[layer]) / + rc.layer_pfb[layer]; + ++rc.layer_enc_frames[layer]; + } + } + } + + // Update for short-time encoding bitrate states, for moving + // window of size rc->window, shifted by rc->window / 2. + // Ignore first window segment, due to key frame. + if (frame_cnt > rc.window_size) { + tl = layer_id.temporal_layer_id; + for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) { + sum_bitrate += 0.001 * 8.0 * sizes[sl] * framerate; + } + if (frame_cnt % rc.window_size == 0) { + rc.window_count += 1; + rc.avg_st_encoding_bitrate += sum_bitrate / rc.window_size; + rc.variance_st_encoding_bitrate += + (sum_bitrate / rc.window_size) * + (sum_bitrate / rc.window_size); + sum_bitrate = 0.0; + } + } + + // Second shifted window. 
+ if (frame_cnt > rc.window_size + rc.window_size / 2) { + tl = layer_id.temporal_layer_id; + for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) { + sum_bitrate2 += 0.001 * 8.0 * sizes[sl] * framerate; + } + + if (frame_cnt > 2 * rc.window_size && + frame_cnt % rc.window_size == 0) { + rc.window_count += 1; + rc.avg_st_encoding_bitrate += sum_bitrate2 / rc.window_size; + rc.variance_st_encoding_bitrate += + (sum_bitrate2 / rc.window_size) * + (sum_bitrate2 / rc.window_size); + sum_bitrate2 = 0.0; + } + } + } +#endif + } printf("SVC frame: %d, kf: %d, size: %d, pts: %d\n", frames_received, !!(cx_pkt->data.frame.flags & VPX_FRAME_IS_KEY), @@ -424,25 +763,30 @@ int main(int argc, const char **argv) { pts += frame_duration; } } - printf("Processed %d frames\n", frame_cnt); - fclose(infile); +#if OUTPUT_RC_STATS + if (svc_ctx.output_rc_stat) { + printout_rate_control_summary(&rc, &enc_cfg, frame_cnt); + printf("\n"); + } +#endif if (vpx_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec"); - if (app_input.passes == 2) stats_close(&app_input.rc_stats, 1); - if (writer) { vpx_video_writer_close(writer); } - +#if OUTPUT_RC_STATS + if (svc_ctx.output_rc_stat) { + for (tl = 0; tl < enc_cfg.ts_number_layers; ++tl) { + vpx_video_writer_close(outfile[tl]); + } + } +#endif vpx_img_free(&raw); - // display average size, psnr printf("%s", vpx_svc_dump_statistics(&svc_ctx)); - vpx_svc_release(&svc_ctx); - return EXIT_SUCCESS; } diff --git a/chromium/third_party/libvpx/source/libvpx/examples/vpx_temporal_svc_encoder.c b/chromium/third_party/libvpx/source/libvpx/examples/vpx_temporal_svc_encoder.c index 349875997b5..484deb5b360 100644 --- a/chromium/third_party/libvpx/source/libvpx/examples/vpx_temporal_svc_encoder.c +++ b/chromium/third_party/libvpx/source/libvpx/examples/vpx_temporal_svc_encoder.c @@ -28,7 +28,7 @@ static const char *exec_name; -void usage_exit() { +void usage_exit(void) { exit(EXIT_FAILURE); } @@ -70,6 +70,7 @@ struct RateControlMetrics { int window_size; // Number of window measurements. int window_count; + int layer_target_bitrate[VPX_MAX_LAYERS]; }; // Note: these rate control metrics assume only 1 key frame in the @@ -85,13 +86,13 @@ static void set_rate_control_metrics(struct RateControlMetrics *rc, // per-frame-bandwidth, for the rate control encoding stats below. 
const double framerate = cfg->g_timebase.den / cfg->g_timebase.num; rc->layer_framerate[0] = framerate / cfg->ts_rate_decimator[0]; - rc->layer_pfb[0] = 1000.0 * cfg->ts_target_bitrate[0] / + rc->layer_pfb[0] = 1000.0 * rc->layer_target_bitrate[0] / rc->layer_framerate[0]; for (i = 0; i < cfg->ts_number_layers; ++i) { if (i > 0) { rc->layer_framerate[i] = framerate / cfg->ts_rate_decimator[i]; rc->layer_pfb[i] = 1000.0 * - (cfg->ts_target_bitrate[i] - cfg->ts_target_bitrate[i - 1]) / + (rc->layer_target_bitrate[i] - rc->layer_target_bitrate[i - 1]) / (rc->layer_framerate[i] - rc->layer_framerate[i - 1]); } rc->layer_input_frames[i] = 0; @@ -128,7 +129,7 @@ static void printout_rate_control_summary(struct RateControlMetrics *rc, rc->layer_avg_rate_mismatch[i] = 100.0 * rc->layer_avg_rate_mismatch[i] / rc->layer_enc_frames[i]; printf("For layer#: %d \n", i); - printf("Bitrate (target vs actual): %d %f \n", cfg->ts_target_bitrate[i], + printf("Bitrate (target vs actual): %d %f \n", rc->layer_target_bitrate[i], rc->layer_encoding_bitrate[i]); printf("Average frame size (target vs actual): %f %f \n", rc->layer_pfb[i], rc->layer_avg_frame_size[i]); @@ -597,13 +598,16 @@ int main(int argc, char **argv) { for (i = min_args_base; (int)i < min_args_base + mode_to_num_layers[layering_mode]; ++i) { - cfg.ts_target_bitrate[i - 11] = strtol(argv[i], NULL, 0); + rc.layer_target_bitrate[i - 11] = strtol(argv[i], NULL, 0); + if (strncmp(encoder->name, "vp8", 3) == 0) + cfg.ts_target_bitrate[i - 11] = rc.layer_target_bitrate[i - 11]; + else if (strncmp(encoder->name, "vp9", 3) == 0) + cfg.layer_target_bitrate[i - 11] = rc.layer_target_bitrate[i - 11]; } // Real time parameters. cfg.rc_dropframe_thresh = strtol(argv[9], NULL, 0); cfg.rc_end_usage = VPX_CBR; - cfg.rc_resize_allowed = 0; cfg.rc_min_quantizer = 2; cfg.rc_max_quantizer = 56; if (strncmp(encoder->name, "vp9", 3) == 0) @@ -614,6 +618,9 @@ int main(int argc, char **argv) { cfg.rc_buf_optimal_sz = 600; cfg.rc_buf_sz = 1000; + // Disable dynamic resizing by default. + cfg.rc_resize_allowed = 0; + // Use 1 thread as default. cfg.g_threads = 1; @@ -625,6 +632,8 @@ int main(int argc, char **argv) { // Disable automatic keyframe placement. cfg.kf_min_dist = cfg.kf_max_dist = 3000; + cfg.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS; + set_temporal_layer_pattern(layering_mode, &cfg, layer_flags, @@ -633,8 +642,8 @@ int main(int argc, char **argv) { set_rate_control_metrics(&rc, &cfg); // Target bandwidth for the whole stream. - // Set to ts_target_bitrate for highest layer (total bitrate). - cfg.rc_target_bitrate = cfg.ts_target_bitrate[cfg.ts_number_layers - 1]; + // Set to layer_target_bitrate for highest layer (total bitrate). + cfg.rc_target_bitrate = rc.layer_target_bitrate[cfg.ts_number_layers - 1]; // Open input file. if (!(infile = fopen(argv[1], "rb"))) { @@ -677,15 +686,22 @@ int main(int argc, char **argv) { vpx_codec_control(&codec, VP8E_SET_NOISE_SENSITIVITY, kDenoiserOff); vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 0); } else if (strncmp(encoder->name, "vp9", 3) == 0) { - vpx_codec_control(&codec, VP8E_SET_CPUUSED, speed); - vpx_codec_control(&codec, VP9E_SET_AQ_MODE, 3); - vpx_codec_control(&codec, VP9E_SET_FRAME_PERIODIC_BOOST, 0); - vpx_codec_control(&codec, VP9E_SET_NOISE_SENSITIVITY, 0); - vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 0); - vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, (cfg.g_threads >> 1)); - if (vpx_codec_control(&codec, VP9E_SET_SVC, layering_mode > 0 ? 
1: 0)) { - die_codec(&codec, "Failed to set SVC"); + vpx_svc_extra_cfg_t svc_params; + vpx_codec_control(&codec, VP8E_SET_CPUUSED, speed); + vpx_codec_control(&codec, VP9E_SET_AQ_MODE, 3); + vpx_codec_control(&codec, VP9E_SET_FRAME_PERIODIC_BOOST, 0); + vpx_codec_control(&codec, VP9E_SET_NOISE_SENSITIVITY, 0); + vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 0); + vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, (cfg.g_threads >> 1)); + if (vpx_codec_control(&codec, VP9E_SET_SVC, layering_mode > 0 ? 1: 0)) + die_codec(&codec, "Failed to set SVC"); + for (i = 0; i < cfg.ts_number_layers; ++i) { + svc_params.max_quantizers[i] = cfg.rc_max_quantizer; + svc_params.min_quantizers[i] = cfg.rc_min_quantizer; } + svc_params.scaling_factor_num[0] = cfg.g_h; + svc_params.scaling_factor_den[0] = cfg.g_h; + vpx_codec_control(&codec, VP9E_SET_SVC_PARAMETERS, &svc_params); } if (strncmp(encoder->name, "vp8", 3) == 0) { vpx_codec_control(&codec, VP8E_SET_SCREEN_CONTENT_MODE, 0); diff --git a/chromium/third_party/libvpx/source/libvpx/libs.mk b/chromium/third_party/libvpx/source/libvpx/libs.mk index 6eee0039c29..6215990c9bf 100644 --- a/chromium/third_party/libvpx/source/libvpx/libs.mk +++ b/chromium/third_party/libvpx/source/libvpx/libs.mk @@ -25,7 +25,7 @@ $$(BUILD_PFX)$(1).h: $$(SRC_PATH_BARE)/$(2) @echo " [CREATE] $$@" $$(qexec)$$(SRC_PATH_BARE)/build/make/rtcd.pl --arch=$$(TGT_ISA) \ --sym=$(1) \ - --config=$$(CONFIG_DIR)$$(target)$$(if $$(FAT_ARCHS),,-$$(TOOLCHAIN)).mk \ + --config=$$(CONFIG_DIR)$$(target)-$$(TOOLCHAIN).mk \ $$(RTCD_OPTIONS) $$^ > $$@ CLEAN-OBJS += $$(BUILD_PFX)$(1).h RTCD += $$(BUILD_PFX)$(1).h @@ -34,13 +34,6 @@ endef CODEC_SRCS-yes += CHANGELOG CODEC_SRCS-yes += libs.mk -# If this is a universal (fat) binary, then all the subarchitectures have -# already been built and our job is to stitch them together. The -# BUILD_LIBVPX variable indicates whether we should be building -# (compiling, linking) the library. The LIPO_LIBVPX variable indicates -# that we're stitching. 
-$(eval $(if $(filter universal%,$(TOOLCHAIN)),LIPO_LIBVPX,BUILD_LIBVPX):=yes) - include $(SRC_PATH_BARE)/vpx/vpx_codec.mk CODEC_SRCS-yes += $(addprefix vpx/,$(call enabled,API_SRCS)) CODEC_DOC_SRCS += $(addprefix vpx/,$(call enabled,API_DOC_SRCS)) @@ -140,18 +133,18 @@ INSTALL_MAPS += $(foreach p,$(VS_PLATFORMS),$(LIBSUBDIR)/$(p)/% $(p)/Release/%) INSTALL_MAPS += $(foreach p,$(VS_PLATFORMS),$(LIBSUBDIR)/$(p)/% $(p)/Debug/%) endif -CODEC_SRCS-$(BUILD_LIBVPX) += build/make/version.sh -CODEC_SRCS-$(BUILD_LIBVPX) += build/make/rtcd.pl -CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/emmintrin_compat.h -CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/mem_ops.h -CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/mem_ops_aligned.h -CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/vpx_once.h -CODEC_SRCS-$(BUILD_LIBVPX) += $(BUILD_PFX)vpx_config.c +CODEC_SRCS-yes += build/make/version.sh +CODEC_SRCS-yes += build/make/rtcd.pl +CODEC_SRCS-yes += vpx_ports/emmintrin_compat.h +CODEC_SRCS-yes += vpx_ports/mem_ops.h +CODEC_SRCS-yes += vpx_ports/mem_ops_aligned.h +CODEC_SRCS-yes += vpx_ports/vpx_once.h +CODEC_SRCS-yes += $(BUILD_PFX)vpx_config.c INSTALL-SRCS-no += $(BUILD_PFX)vpx_config.c ifeq ($(ARCH_X86)$(ARCH_X86_64),yes) INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += third_party/x86inc/x86inc.asm endif -CODEC_EXPORTS-$(BUILD_LIBVPX) += vpx/exports_com +CODEC_EXPORTS-yes += vpx/exports_com CODEC_EXPORTS-$(CONFIG_ENCODERS) += vpx/exports_enc CODEC_EXPORTS-$(CONFIG_DECODERS) += vpx/exports_dec @@ -218,7 +211,7 @@ vpx.$(VCPROJ_SFX): $(CODEC_SRCS) vpx.def $(filter-out $(addprefix %, $(ASM_INCLUDES)), $^) \ --src-path-bare="$(SRC_PATH_BARE)" \ -PROJECTS-$(BUILD_LIBVPX) += vpx.$(VCPROJ_SFX) +PROJECTS-yes += vpx.$(VCPROJ_SFX) vpx.$(VCPROJ_SFX): vpx_config.asm vpx.$(VCPROJ_SFX): $(RTCD) @@ -226,31 +219,39 @@ vpx.$(VCPROJ_SFX): $(RTCD) endif else LIBVPX_OBJS=$(call objs,$(CODEC_SRCS)) -OBJS-$(BUILD_LIBVPX) += $(LIBVPX_OBJS) -LIBS-$(if $(BUILD_LIBVPX),$(CONFIG_STATIC)) += $(BUILD_PFX)libvpx.a $(BUILD_PFX)libvpx_g.a +OBJS-yes += $(LIBVPX_OBJS) +LIBS-$(if yes,$(CONFIG_STATIC)) += $(BUILD_PFX)libvpx.a $(BUILD_PFX)libvpx_g.a $(BUILD_PFX)libvpx_g.a: $(LIBVPX_OBJS) - -BUILD_LIBVPX_SO := $(if $(BUILD_LIBVPX),$(CONFIG_SHARED)) - SO_VERSION_MAJOR := 2 SO_VERSION_MINOR := 0 SO_VERSION_PATCH := 0 ifeq ($(filter darwin%,$(TGT_OS)),$(TGT_OS)) LIBVPX_SO := libvpx.$(SO_VERSION_MAJOR).dylib +SHARED_LIB_SUF := .dylib EXPORT_FILE := libvpx.syms LIBVPX_SO_SYMLINKS := $(addprefix $(LIBSUBDIR)/, \ libvpx.dylib ) else +ifeq ($(filter os2%,$(TGT_OS)),$(TGT_OS)) +LIBVPX_SO := libvpx$(SO_VERSION_MAJOR).dll +SHARED_LIB_SUF := _dll.a +EXPORT_FILE := libvpx.def +LIBVPX_SO_SYMLINKS := +LIBVPX_SO_IMPLIB := libvpx_dll.a +else LIBVPX_SO := libvpx.so.$(SO_VERSION_MAJOR).$(SO_VERSION_MINOR).$(SO_VERSION_PATCH) +SHARED_LIB_SUF := .so EXPORT_FILE := libvpx.ver LIBVPX_SO_SYMLINKS := $(addprefix $(LIBSUBDIR)/, \ libvpx.so libvpx.so.$(SO_VERSION_MAJOR) \ libvpx.so.$(SO_VERSION_MAJOR).$(SO_VERSION_MINOR)) endif +endif -LIBS-$(BUILD_LIBVPX_SO) += $(BUILD_PFX)$(LIBVPX_SO)\ - $(notdir $(LIBVPX_SO_SYMLINKS)) +LIBS-$(CONFIG_SHARED) += $(BUILD_PFX)$(LIBVPX_SO)\ + $(notdir $(LIBVPX_SO_SYMLINKS)) \ + $(if $(LIBVPX_SO_IMPLIB), $(BUILD_PFX)$(LIBVPX_SO_IMPLIB)) $(BUILD_PFX)$(LIBVPX_SO): $(LIBVPX_OBJS) $(EXPORT_FILE) $(BUILD_PFX)$(LIBVPX_SO): extralibs += -lm $(BUILD_PFX)$(LIBVPX_SO): SONAME = libvpx.so.$(SO_VERSION_MAJOR) @@ -268,6 +269,19 @@ libvpx.syms: $(call enabled,CODEC_EXPORTS) $(qexec)awk '{print "_"$$2}' $^ >$@ CLEAN-OBJS += libvpx.syms +libvpx.def: $(call enabled,CODEC_EXPORTS) + @echo " [CREATE] 
$@" + $(qexec)echo LIBRARY $(LIBVPX_SO:.dll=) INITINSTANCE TERMINSTANCE > $@ + $(qexec)echo "DATA MULTIPLE NONSHARED" >> $@ + $(qexec)echo "EXPORTS" >> $@ + $(qexec)awk '!/vpx_svc_*/ {print "_"$$2}' $^ >>$@ +CLEAN-OBJS += libvpx.def + +libvpx_dll.a: $(LIBVPX_SO) + @echo " [IMPLIB] $@" + $(qexec)emximp -o $@ $< +CLEAN-OBJS += libvpx_dll.a + define libvpx_symlink_template $(1): $(2) @echo " [LN] $(2) $$@" @@ -283,11 +297,12 @@ $(eval $(call libvpx_symlink_template,\ $(LIBVPX_SO))) -INSTALL-LIBS-$(BUILD_LIBVPX_SO) += $(LIBVPX_SO_SYMLINKS) -INSTALL-LIBS-$(BUILD_LIBVPX_SO) += $(LIBSUBDIR)/$(LIBVPX_SO) +INSTALL-LIBS-$(CONFIG_SHARED) += $(LIBVPX_SO_SYMLINKS) +INSTALL-LIBS-$(CONFIG_SHARED) += $(LIBSUBDIR)/$(LIBVPX_SO) +INSTALL-LIBS-$(CONFIG_SHARED) += $(if $(LIBVPX_SO_IMPLIB),$(LIBSUBDIR)/$(LIBVPX_SO_IMPLIB)) -LIBS-$(BUILD_LIBVPX) += vpx.pc +LIBS-yes += vpx.pc vpx.pc: config.mk libs.mk @echo " [CREATE] $@" $(qexec)echo '# pkg-config file from libvpx $(VERSION_STRING)' > $@ @@ -313,9 +328,6 @@ INSTALL_MAPS += $(LIBSUBDIR)/pkgconfig/%.pc %.pc CLEAN-OBJS += vpx.pc endif -LIBS-$(LIPO_LIBVPX) += libvpx.a -$(eval $(if $(LIPO_LIBVPX),$(call lipo_lib_template,libvpx.a))) - # # Rule to make assembler configuration file from C configuration file # @@ -354,11 +366,15 @@ LIBVPX_TEST_DATA_PATH ?= . include $(SRC_PATH_BARE)/test/test.mk LIBVPX_TEST_SRCS=$(addprefix test/,$(call enabled,LIBVPX_TEST_SRCS)) -LIBVPX_TEST_BINS=./test_libvpx$(EXE_SFX) +LIBVPX_TEST_BIN=./test_libvpx$(EXE_SFX) LIBVPX_TEST_DATA=$(addprefix $(LIBVPX_TEST_DATA_PATH)/,\ $(call enabled,LIBVPX_TEST_DATA)) libvpx_test_data_url=http://downloads.webmproject.org/test_data/libvpx/$(1) +TEST_INTRA_PRED_SPEED_BIN=./test_intra_pred_speed$(EXE_SFX) +TEST_INTRA_PRED_SPEED_SRCS=$(addprefix test/,$(call enabled,TEST_INTRA_PRED_SPEED_SRCS)) +TEST_INTRA_PRED_SPEED_OBJS := $(sort $(call objs,$(TEST_INTRA_PRED_SPEED_SRCS))) + libvpx_test_srcs.txt: @echo " [CREATE] $@" @echo $(LIBVPX_TEST_SRCS) | xargs -n1 echo | LC_ALL=C sort -u > $@ @@ -422,7 +438,25 @@ test_libvpx.$(VCPROJ_SFX): $(LIBVPX_TEST_SRCS) vpx.$(VCPROJ_SFX) gtest.$(VCPROJ_ PROJECTS-$(CONFIG_MSVS) += test_libvpx.$(VCPROJ_SFX) -LIBVPX_TEST_BINS := $(addprefix $(TGT_OS:win64=x64)/Release/,$(notdir $(LIBVPX_TEST_BINS))) +LIBVPX_TEST_BIN := $(addprefix $(TGT_OS:win64=x64)/Release/,$(notdir $(LIBVPX_TEST_BIN))) + +ifneq ($(strip $(TEST_INTRA_PRED_SPEED_OBJS)),) +PROJECTS-$(CONFIG_MSVS) += test_intra_pred_speed.$(VCPROJ_SFX) +test_intra_pred_speed.$(VCPROJ_SFX): $(TEST_INTRA_PRED_SPEED_SRCS) vpx.$(VCPROJ_SFX) gtest.$(VCPROJ_SFX) + @echo " [CREATE] $@" + $(qexec)$(GEN_VCPROJ) \ + --exe \ + --target=$(TOOLCHAIN) \ + --name=test_intra_pred_speed \ + -D_VARIADIC_MAX=10 \ + --proj-guid=CD837F5F-52D8-4314-A370-895D614166A7 \ + --ver=$(CONFIG_VS_VERSION) \ + --src-path-bare="$(SRC_PATH_BARE)" \ + $(if $(CONFIG_STATIC_MSVCRT),--static-crt) \ + --out=$@ $(INTERNAL_CFLAGS) $(CFLAGS) \ + -I. -I"$(SRC_PATH_BARE)/third_party/googletest/src/include" \ + -L. 
-l$(CODEC_LIB) -l$(GTEST_LIB) $^ +endif # TEST_INTRA_PRED_SPEED endif else @@ -433,45 +467,54 @@ ifeq ($(filter win%,$(TGT_OS)),$(TGT_OS)) # Disabling pthreads globally will cause issues on darwin and possibly elsewhere $(GTEST_OBJS) $(GTEST_OBJS:.o=.d): CXXFLAGS += -DGTEST_HAS_PTHREAD=0 endif -$(GTEST_OBJS) $(GTEST_OBJS:.o=.d): CXXFLAGS += -I$(SRC_PATH_BARE)/third_party/googletest/src -$(GTEST_OBJS) $(GTEST_OBJS:.o=.d): CXXFLAGS += -I$(SRC_PATH_BARE)/third_party/googletest/src/include -OBJS-$(BUILD_LIBVPX) += $(GTEST_OBJS) -LIBS-$(BUILD_LIBVPX) += $(BUILD_PFX)libgtest.a $(BUILD_PFX)libgtest_g.a +GTEST_INCLUDES := -I$(SRC_PATH_BARE)/third_party/googletest/src +GTEST_INCLUDES += -I$(SRC_PATH_BARE)/third_party/googletest/src/include +$(GTEST_OBJS) $(GTEST_OBJS:.o=.d): CXXFLAGS += $(GTEST_INCLUDES) +OBJS-yes += $(GTEST_OBJS) +LIBS-yes += $(BUILD_PFX)libgtest.a $(BUILD_PFX)libgtest_g.a $(BUILD_PFX)libgtest_g.a: $(GTEST_OBJS) LIBVPX_TEST_OBJS=$(sort $(call objs,$(LIBVPX_TEST_SRCS))) -$(LIBVPX_TEST_OBJS) $(LIBVPX_TEST_OBJS:.o=.d): CXXFLAGS += -I$(SRC_PATH_BARE)/third_party/googletest/src -$(LIBVPX_TEST_OBJS) $(LIBVPX_TEST_OBJS:.o=.d): CXXFLAGS += -I$(SRC_PATH_BARE)/third_party/googletest/src/include -OBJS-$(BUILD_LIBVPX) += $(LIBVPX_TEST_OBJS) -BINS-$(BUILD_LIBVPX) += $(LIBVPX_TEST_BINS) +$(LIBVPX_TEST_OBJS) $(LIBVPX_TEST_OBJS:.o=.d): CXXFLAGS += $(GTEST_INCLUDES) +OBJS-yes += $(LIBVPX_TEST_OBJS) +BINS-yes += $(LIBVPX_TEST_BIN) CODEC_LIB=$(if $(CONFIG_DEBUG_LIBS),vpx_g,vpx) -CODEC_LIB_SUF=$(if $(CONFIG_SHARED),.so,.a) -$(foreach bin,$(LIBVPX_TEST_BINS),\ - $(if $(BUILD_LIBVPX),$(eval $(bin): \ - lib$(CODEC_LIB)$(CODEC_LIB_SUF) libgtest.a ))\ - $(if $(BUILD_LIBVPX),$(eval $(call linkerxx_template,$(bin),\ - $(LIBVPX_TEST_OBJS) \ - -L. -lvpx -lgtest $(extralibs) -lm)\ - )))\ - $(if $(LIPO_LIBS),$(eval $(call lipo_bin_template,$(bin))))\ - -endif +CODEC_LIB_SUF=$(if $(CONFIG_SHARED),$(SHARED_LIB_SUF),.a) +TEST_LIBS := lib$(CODEC_LIB)$(CODEC_LIB_SUF) libgtest.a +$(LIBVPX_TEST_BIN): $(TEST_LIBS) +$(eval $(call linkerxx_template,$(LIBVPX_TEST_BIN), \ + $(LIBVPX_TEST_OBJS) \ + -L. -lvpx -lgtest $(extralibs) -lm)) + +ifneq ($(strip $(TEST_INTRA_PRED_SPEED_OBJS)),) +$(TEST_INTRA_PRED_SPEED_OBJS) $(TEST_INTRA_PRED_SPEED_OBJS:.o=.d): CXXFLAGS += $(GTEST_INCLUDES) +OBJS-yes += $(TEST_INTRA_PRED_SPEED_OBJS) +BINS-yes += $(TEST_INTRA_PRED_SPEED_BIN) + +$(TEST_INTRA_PRED_SPEED_BIN): $(TEST_LIBS) +$(eval $(call linkerxx_template,$(TEST_INTRA_PRED_SPEED_BIN), \ + $(TEST_INTRA_PRED_SPEED_OBJS) \ + -L. -lvpx -lgtest $(extralibs) -lm)) +endif # TEST_INTRA_PRED_SPEED + +endif # CONFIG_UNIT_TESTS # Install test sources only if codec source is included INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(patsubst $(SRC_PATH_BARE)/%,%,\ $(shell find $(SRC_PATH_BARE)/third_party/googletest -type f)) INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(LIBVPX_TEST_SRCS) +INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(TEST_INTRA_PRED_SPEED_SRCS) define test_shard_template test:: test_shard.$(1) -test_shard.$(1): $(LIBVPX_TEST_BINS) testdata +test-no-data-check:: test_shard_ndc.$(1) +test_shard.$(1) test_shard_ndc.$(1): $(LIBVPX_TEST_BIN) @set -e; \ - for t in $(LIBVPX_TEST_BINS); do \ - export GTEST_SHARD_INDEX=$(1); \ - export GTEST_TOTAL_SHARDS=$(2); \ - $$$$t; \ - done + export GTEST_SHARD_INDEX=$(1); \ + export GTEST_TOTAL_SHARDS=$(2); \ + $(LIBVPX_TEST_BIN) +test_shard.$(1): testdata .PHONY: test_shard.$(1) endef @@ -516,15 +559,16 @@ ifeq ($(CONFIG_MSVS),yes) # TODO(tomfinegan): Support running the debug versions of tools? 
TEST_BIN_PATH := $(addsuffix /$(TGT_OS:win64=x64)/Release, $(TEST_BIN_PATH)) endif -utiltest: testdata +utiltest utiltest-no-data-check: $(qexec)$(SRC_PATH_BARE)/test/vpxdec.sh \ --test-data-path $(LIBVPX_TEST_DATA_PATH) \ --bin-path $(TEST_BIN_PATH) $(qexec)$(SRC_PATH_BARE)/test/vpxenc.sh \ --test-data-path $(LIBVPX_TEST_DATA_PATH) \ --bin-path $(TEST_BIN_PATH) +utiltest: testdata else -utiltest: +utiltest utiltest-no-data-check: @echo Unit tests must be enabled to make the utiltest target. endif @@ -542,11 +586,12 @@ ifeq ($(CONFIG_MSVS),yes) # TODO(tomfinegan): Support running the debug versions of tools? EXAMPLES_BIN_PATH := $(TGT_OS:win64=x64)/Release endif -exampletest: examples testdata +exampletest exampletest-no-data-check: examples $(qexec)$(SRC_PATH_BARE)/test/examples.sh \ --test-data-path $(LIBVPX_TEST_DATA_PATH) \ --bin-path $(EXAMPLES_BIN_PATH) +exampletest: testdata else -exampletest: +exampletest exampletest-no-data-check: @echo Unit tests must be enabled to make the exampletest target. endif diff --git a/chromium/third_party/libvpx/source/libvpx/md5_utils.c b/chromium/third_party/libvpx/source/libvpx/md5_utils.c index 8fb26e2084b..f4f893a2d67 100644 --- a/chromium/third_party/libvpx/source/libvpx/md5_utils.c +++ b/chromium/third_party/libvpx/source/libvpx/md5_utils.c @@ -24,7 +24,7 @@ #include "md5_utils.h" -void +static void byteSwap(UWORD32 *buf, unsigned words) { md5byte *p; diff --git a/chromium/third_party/libvpx/source/libvpx/rate_hist.c b/chromium/third_party/libvpx/source/libvpx/rate_hist.c index 1cef19bdd64..a77222b1618 100644 --- a/chromium/third_party/libvpx/source/libvpx/rate_hist.c +++ b/chromium/third_party/libvpx/source/libvpx/rate_hist.c @@ -88,6 +88,9 @@ void update_rate_histogram(struct rate_hist *hist, if (now < cfg->rc_buf_initial_sz) return; + if (!cfg->rc_target_bitrate) + return; + then = now; /* Sum the size over the past rc_buf_sz ms */ diff --git a/chromium/third_party/libvpx/source/libvpx/tools_common.c b/chromium/third_party/libvpx/source/libvpx/tools_common.c index e243a91575a..901734e0f34 100644 --- a/chromium/third_party/libvpx/source/libvpx/tools_common.c +++ b/chromium/third_party/libvpx/source/libvpx/tools_common.c @@ -140,7 +140,7 @@ static const VpxInterface vpx_encoders[] = { #endif }; -int get_vpx_encoder_count() { +int get_vpx_encoder_count(void) { return sizeof(vpx_encoders) / sizeof(vpx_encoders[0]); } @@ -170,7 +170,7 @@ static const VpxInterface vpx_decoders[] = { #endif }; -int get_vpx_decoder_count() { +int get_vpx_decoder_count(void) { return sizeof(vpx_decoders) / sizeof(vpx_decoders[0]); } diff --git a/chromium/third_party/libvpx/source/libvpx/tools_common.h b/chromium/third_party/libvpx/source/libvpx/tools_common.h index de6c38f0f5b..adccec88bb3 100644 --- a/chromium/third_party/libvpx/source/libvpx/tools_common.h +++ b/chromium/third_party/libvpx/source/libvpx/tools_common.h @@ -16,6 +16,7 @@ #include "vpx/vpx_codec.h" #include "vpx/vpx_image.h" #include "vpx/vpx_integer.h" +#include "vpx_ports/msvc.h" #if CONFIG_ENCODERS #include "./y4minput.h" @@ -34,7 +35,6 @@ #if CONFIG_OS_SUPPORT #if defined(_MSC_VER) #include <io.h> /* NOLINT */ -#define snprintf _snprintf #define isatty _isatty #define fileno _fileno #else @@ -89,6 +89,7 @@ struct VpxInputContext { enum VideoFileType file_type; uint32_t width; uint32_t height; + struct VpxRational pixel_aspect_ratio; vpx_img_fmt_t fmt; vpx_bit_depth_t bit_depth; int only_i420; @@ -119,7 +120,7 @@ void warn(const char *fmt, ...); void die_codec(vpx_codec_ctx_t *ctx, const char *s) 
VPX_NO_RETURN; /* The tool including this file must define usage_exit() */ -void usage_exit() VPX_NO_RETURN; +void usage_exit(void) VPX_NO_RETURN; #undef VPX_NO_RETURN @@ -131,11 +132,11 @@ typedef struct VpxInterface { vpx_codec_iface_t *(*const codec_interface)(); } VpxInterface; -int get_vpx_encoder_count(); +int get_vpx_encoder_count(void); const VpxInterface *get_vpx_encoder_by_index(int i); const VpxInterface *get_vpx_encoder_by_name(const char *name); -int get_vpx_decoder_count(); +int get_vpx_decoder_count(void); const VpxInterface *get_vpx_decoder_by_index(int i); const VpxInterface *get_vpx_decoder_by_name(const char *name); const VpxInterface *get_vpx_decoder_by_fourcc(uint32_t fourcc); diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/alloccommon.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/alloccommon.c index b9d875a2ff7..8dfd4ce203e 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/common/alloccommon.c +++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/alloccommon.c @@ -10,6 +10,7 @@ #include "vpx_config.h" +#include "alloccommon.h" #include "blockd.h" #include "vpx_mem/vpx_mem.h" #include "onyxc_int.h" diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/armv6/vp8_variance16x16_armv6.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/armv6/vp8_variance16x16_armv6.asm deleted file mode 100644 index 39919579f80..00000000000 --- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/armv6/vp8_variance16x16_armv6.asm +++ /dev/null @@ -1,154 +0,0 @@ -; -; Copyright (c) 2011 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - - - EXPORT |vp8_variance16x16_armv6| - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 unsigned char *src_ptr -; r1 int source_stride -; r2 unsigned char *ref_ptr -; r3 int recon_stride -; stack unsigned int *sse -|vp8_variance16x16_armv6| PROC - - stmfd sp!, {r4-r12, lr} - - pld [r0, r1, lsl #0] - pld [r2, r3, lsl #0] - - mov r8, #0 ; initialize sum = 0 - mov r11, #0 ; initialize sse = 0 - mov r12, #16 ; set loop counter to 16 (=block height) - -loop - ; 1st 4 pixels - ldr r4, [r0, #0] ; load 4 src pixels - ldr r5, [r2, #0] ; load 4 ref pixels - - mov lr, #0 ; constant zero - - usub8 r6, r4, r5 ; calculate difference - pld [r0, r1, lsl #1] - sel r7, r6, lr ; select bytes with positive difference - usub8 r9, r5, r4 ; calculate difference with reversed operands - pld [r2, r3, lsl #1] - sel r6, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - ; calculate total sum - adds r8, r8, r4 ; add positive differences to sum - subs r8, r8, r5 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r10, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 2nd 4 pixels - ldr r4, [r0, #4] ; load 4 src pixels - ldr r5, [r2, #4] ; load 4 ref pixels - smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) - - usub8 r6, r4, r5 ; calculate difference - sel r7, r6, lr ; select bytes with positive difference - usub8 r9, r5, r4 ; calculate difference with reversed operands - sel r6, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r10, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 3rd 4 pixels - ldr r4, [r0, #8] ; load 4 src pixels - ldr r5, [r2, #8] ; load 4 ref pixels - smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) - - usub8 r6, r4, r5 ; calculate difference - sel r7, r6, lr ; select bytes with positive difference - usub8 r9, r5, r4 ; calculate difference with reversed operands - sel r6, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r10, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 4th 4 pixels - ldr r4, [r0, #12] ; load 4 src pixels - ldr r5, [r2, #12] ; load 4 ref pixels - smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) - - usub8 r6, r4, r5 ; calculate difference - add r0, r0, r1 ; set src_ptr to next row - sel r7, r6, lr ; select 
bytes with positive difference - usub8 r9, r5, r4 ; calculate difference with reversed operands - add r2, r2, r3 ; set dst_ptr to next row - sel r6, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r10, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) - - - subs r12, r12, #1 - - bne loop - - ; return stuff - ldr r6, [sp, #40] ; get address of sse - mul r0, r8, r8 ; sum * sum - str r11, [r6] ; store sse - sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8)) - - ldmfd sp!, {r4-r12, pc} - - ENDP - - END - diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/armv6/vp8_variance8x8_armv6.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/armv6/vp8_variance8x8_armv6.asm deleted file mode 100644 index 915ee499309..00000000000 --- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/armv6/vp8_variance8x8_armv6.asm +++ /dev/null @@ -1,101 +0,0 @@ -; -; Copyright (c) 2011 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - - - EXPORT |vp8_variance8x8_armv6| - - ARM - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 unsigned char *src_ptr -; r1 int source_stride -; r2 unsigned char *ref_ptr -; r3 int recon_stride -; stack unsigned int *sse -|vp8_variance8x8_armv6| PROC - - push {r4-r10, lr} - - pld [r0, r1, lsl #0] - pld [r2, r3, lsl #0] - - mov r12, #8 ; set loop counter to 8 (=block height) - mov r4, #0 ; initialize sum = 0 - mov r5, #0 ; initialize sse = 0 - -loop - ; 1st 4 pixels - ldr r6, [r0, #0x0] ; load 4 src pixels - ldr r7, [r2, #0x0] ; load 4 ref pixels - - mov lr, #0 ; constant zero - - usub8 r8, r6, r7 ; calculate difference - pld [r0, r1, lsl #1] - sel r10, r8, lr ; select bytes with positive difference - usub8 r9, r7, r6 ; calculate difference with reversed operands - pld [r2, r3, lsl #1] - sel r8, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r6, r10, lr ; calculate sum of positive differences - usad8 r7, r8, lr ; calculate sum of negative differences - orr r8, r8, r10 ; differences of all 4 pixels - ; calculate total sum - add r4, r4, r6 ; add positive differences to sum - sub r4, r4, r7 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r7, r8 ; byte (two pixels) to halfwords - uxtb16 r10, r8, ror #8 ; another two pixels to halfwords - smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1) - - ; 2nd 4 pixels - ldr r6, [r0, #0x4] ; load 4 src pixels - ldr r7, [r2, #0x4] ; load 4 ref pixels - smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2) - - usub8 r8, r6, r7 ; calculate difference - add r0, r0, r1 ; set src_ptr to next row - sel r10, r8, lr ; select bytes with positive difference - usub8 r9, r7, r6 ; calculate difference with reversed operands - add r2, r2, r3 ; set dst_ptr to next row - sel r8, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r6, r10, lr ; calculate sum of positive differences - usad8 r7, r8, lr ; calculate sum of negative differences - orr r8, r8, r10 ; differences of all 4 pixels - - ; calculate total sum - add r4, r4, r6 ; add positive differences to sum - sub r4, r4, r7 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r7, r8 ; byte (two pixels) to halfwords - uxtb16 r10, r8, ror #8 ; another two pixels to halfwords - smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1) - subs r12, r12, #1 ; next row - smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2) - - bne loop - - ; return stuff - ldr r8, [sp, #32] ; get address of sse - mul r1, r4, r4 ; sum * sum - str r5, [r8] ; store sse - sub r0, r5, r1, ASR #6 ; return (sse - ((sum * sum) >> 6)) - - pop {r4-r10, pc} - - ENDP - - END diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/variance_neon.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/variance_neon.c deleted file mode 100644 index 1b1979073e5..00000000000 --- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/variance_neon.c +++ /dev/null @@ -1,320 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include <arm_neon.h> -#include "vpx_ports/mem.h" - -unsigned int vp8_variance16x16_neon( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - int i; - int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; - uint32x2_t d0u32, d10u32; - int64x1_t d0s64, d1s64; - uint8x16_t q0u8, q1u8, q2u8, q3u8; - uint16x8_t q11u16, q12u16, q13u16, q14u16; - int32x4_t q8s32, q9s32, q10s32; - int64x2_t q0s64, q1s64, q5s64; - - q8s32 = vdupq_n_s32(0); - q9s32 = vdupq_n_s32(0); - q10s32 = vdupq_n_s32(0); - - for (i = 0; i < 8; i++) { - q0u8 = vld1q_u8(src_ptr); - src_ptr += source_stride; - q1u8 = vld1q_u8(src_ptr); - src_ptr += source_stride; - __builtin_prefetch(src_ptr); - - q2u8 = vld1q_u8(ref_ptr); - ref_ptr += recon_stride; - q3u8 = vld1q_u8(ref_ptr); - ref_ptr += recon_stride; - __builtin_prefetch(ref_ptr); - - q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8)); - q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8)); - q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8)); - q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8)); - - d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); - d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); - q9s32 = vmlal_s16(q9s32, d22s16, d22s16); - q10s32 = vmlal_s16(q10s32, d23s16, d23s16); - - d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); - d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); - q9s32 = vmlal_s16(q9s32, d24s16, d24s16); - q10s32 = vmlal_s16(q10s32, d25s16, d25s16); - - d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); - d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16)); - q9s32 = vmlal_s16(q9s32, d26s16, d26s16); - q10s32 = vmlal_s16(q10s32, d27s16, d27s16); - - d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16)); - d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16)); - q9s32 = vmlal_s16(q9s32, d28s16, d28s16); - q10s32 = vmlal_s16(q10s32, d29s16, d29s16); - } - - q10s32 = vaddq_s32(q10s32, q9s32); - q0s64 = vpaddlq_s32(q8s32); - q1s64 = vpaddlq_s32(q10s32); - - d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64)); - d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); - - q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), - vreinterpret_s32_s64(d0s64)); - vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); - - d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8); - d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); - - return vget_lane_u32(d0u32, 0); -} - -unsigned int vp8_variance16x8_neon( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - int i; - int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; - uint32x2_t d0u32, d10u32; - int64x1_t d0s64, d1s64; - uint8x16_t q0u8, q1u8, q2u8, q3u8; - uint16x8_t q11u16, q12u16, q13u16, q14u16; - int32x4_t q8s32, q9s32, q10s32; - int64x2_t q0s64, q1s64, q5s64; - - q8s32 = vdupq_n_s32(0); - q9s32 = vdupq_n_s32(0); - q10s32 = vdupq_n_s32(0); - - for (i = 0; i < 4; i++) { // variance16x8_neon_loop - q0u8 = vld1q_u8(src_ptr); - src_ptr += source_stride; - q1u8 = vld1q_u8(src_ptr); - src_ptr += source_stride; - __builtin_prefetch(src_ptr); - - q2u8 = vld1q_u8(ref_ptr); - ref_ptr += recon_stride; - 
q3u8 = vld1q_u8(ref_ptr); - ref_ptr += recon_stride; - __builtin_prefetch(ref_ptr); - - q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8)); - q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8)); - q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8)); - q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8)); - - d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); - d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); - q9s32 = vmlal_s16(q9s32, d22s16, d22s16); - q10s32 = vmlal_s16(q10s32, d23s16, d23s16); - - d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); - d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); - q9s32 = vmlal_s16(q9s32, d24s16, d24s16); - q10s32 = vmlal_s16(q10s32, d25s16, d25s16); - - d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); - d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16)); - q9s32 = vmlal_s16(q9s32, d26s16, d26s16); - q10s32 = vmlal_s16(q10s32, d27s16, d27s16); - - d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16)); - d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16)); - q9s32 = vmlal_s16(q9s32, d28s16, d28s16); - q10s32 = vmlal_s16(q10s32, d29s16, d29s16); - } - - q10s32 = vaddq_s32(q10s32, q9s32); - q0s64 = vpaddlq_s32(q8s32); - q1s64 = vpaddlq_s32(q10s32); - - d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64)); - d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); - - q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), - vreinterpret_s32_s64(d0s64)); - vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); - - d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7); - d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); - - return vget_lane_u32(d0u32, 0); -} - -unsigned int vp8_variance8x16_neon( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - int i; - uint8x8_t d0u8, d2u8, d4u8, d6u8; - int16x4_t d22s16, d23s16, d24s16, d25s16; - uint32x2_t d0u32, d10u32; - int64x1_t d0s64, d1s64; - uint16x8_t q11u16, q12u16; - int32x4_t q8s32, q9s32, q10s32; - int64x2_t q0s64, q1s64, q5s64; - - q8s32 = vdupq_n_s32(0); - q9s32 = vdupq_n_s32(0); - q10s32 = vdupq_n_s32(0); - - for (i = 0; i < 8; i++) { // variance8x16_neon_loop - d0u8 = vld1_u8(src_ptr); - src_ptr += source_stride; - d2u8 = vld1_u8(src_ptr); - src_ptr += source_stride; - __builtin_prefetch(src_ptr); - - d4u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; - d6u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; - __builtin_prefetch(ref_ptr); - - q11u16 = vsubl_u8(d0u8, d4u8); - q12u16 = vsubl_u8(d2u8, d6u8); - - d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); - d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); - q9s32 = vmlal_s16(q9s32, d22s16, d22s16); - q10s32 = vmlal_s16(q10s32, d23s16, d23s16); - - d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); - d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); - q9s32 = vmlal_s16(q9s32, d24s16, d24s16); - q10s32 = vmlal_s16(q10s32, d25s16, d25s16); - } - - q10s32 = vaddq_s32(q10s32, q9s32); - q0s64 = vpaddlq_s32(q8s32); - q1s64 = vpaddlq_s32(q10s32); - - d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64)); - d1s64 = 
vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); - - q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), - vreinterpret_s32_s64(d0s64)); - vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); - - d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7); - d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); - - return vget_lane_u32(d0u32, 0); -} - -unsigned int vp8_variance8x8_neon( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - int i; - uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; - int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; - uint32x2_t d0u32, d10u32; - int64x1_t d0s64, d1s64; - uint16x8_t q11u16, q12u16, q13u16, q14u16; - int32x4_t q8s32, q9s32, q10s32; - int64x2_t q0s64, q1s64, q5s64; - - q8s32 = vdupq_n_s32(0); - q9s32 = vdupq_n_s32(0); - q10s32 = vdupq_n_s32(0); - - for (i = 0; i < 2; i++) { // variance8x8_neon_loop - d0u8 = vld1_u8(src_ptr); - src_ptr += source_stride; - d1u8 = vld1_u8(src_ptr); - src_ptr += source_stride; - d2u8 = vld1_u8(src_ptr); - src_ptr += source_stride; - d3u8 = vld1_u8(src_ptr); - src_ptr += source_stride; - - d4u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; - d5u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; - d6u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; - d7u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; - - q11u16 = vsubl_u8(d0u8, d4u8); - q12u16 = vsubl_u8(d1u8, d5u8); - q13u16 = vsubl_u8(d2u8, d6u8); - q14u16 = vsubl_u8(d3u8, d7u8); - - d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); - d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); - q9s32 = vmlal_s16(q9s32, d22s16, d22s16); - q10s32 = vmlal_s16(q10s32, d23s16, d23s16); - - d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); - d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); - q9s32 = vmlal_s16(q9s32, d24s16, d24s16); - q10s32 = vmlal_s16(q10s32, d25s16, d25s16); - - d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); - d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16)); - q9s32 = vmlal_s16(q9s32, d26s16, d26s16); - q10s32 = vmlal_s16(q10s32, d27s16, d27s16); - - d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16)); - d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16)); - q9s32 = vmlal_s16(q9s32, d28s16, d28s16); - q10s32 = vmlal_s16(q10s32, d29s16, d29s16); - } - - q10s32 = vaddq_s32(q10s32, q9s32); - q0s64 = vpaddlq_s32(q8s32); - q1s64 = vpaddlq_s32(q10s32); - - d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64)); - d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); - - q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), - vreinterpret_s32_s64(d0s64)); - vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); - - d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 6); - d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); - - return vget_lane_u32(d0u32, 0); -} diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance_neon.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance_neon.c index 974d3b6532b..3c8ed11f070 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance_neon.c +++ 
b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance_neon.c @@ -12,7 +12,7 @@ #include "vpx_ports/mem.h" #include "vpx/vpx_integer.h" -static const uint16_t bilinear_taps_coeff[8][2] = { +static const uint8_t bilinear_taps_coeff[8][2] = { {128, 0}, {112, 16}, { 96, 32}, @@ -972,9 +972,9 @@ static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, int pixel_step, unsigned int output_height, unsigned int output_width, - const uint16_t *vpx_filter) { - const uint8x8_t f0 = vmov_n_u8((uint8_t)vpx_filter[0]); - const uint8x8_t f1 = vmov_n_u8((uint8_t)vpx_filter[1]); + const uint8_t *vpx_filter) { + const uint8x8_t f0 = vmov_n_u8(vpx_filter[0]); + const uint8x8_t f1 = vmov_n_u8(vpx_filter[1]); unsigned int i; for (i = 0; i < output_height; ++i) { const uint8x8_t src_0 = vld1_u8(&src_ptr[0]); diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/variance_arm.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/variance_arm.c index 467a509420e..0f293f03d94 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/variance_arm.c +++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/variance_arm.c @@ -9,10 +9,14 @@ */ #include "vpx_config.h" -#include "vp8_rtcd.h" +#include "./vp8_rtcd.h" +#include "./vpx_dsp_rtcd.h" #include "vp8/common/variance.h" #include "vp8/common/filter.h" +// TODO(johannkoenig): Move this to vpx_dsp or vp8/encoder +#if CONFIG_VP8_ENCODER + #if HAVE_MEDIA #include "vp8/common/arm/bilinearfilter_arm.h" @@ -40,8 +44,8 @@ unsigned int vp8_sub_pixel_variance8x8_armv6 vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass, 8, 8, 8, VFilter); - return vp8_variance8x8_armv6(second_pass, 8, dst_ptr, - dst_pixels_per_line, sse); + return vpx_variance8x8_media(second_pass, 8, dst_ptr, + dst_pixels_per_line, sse); } unsigned int vp8_sub_pixel_variance16x16_armv6 @@ -86,13 +90,13 @@ unsigned int vp8_sub_pixel_variance16x16_armv6 vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass, 16, 16, 16, VFilter); - var = vp8_variance16x16_armv6(second_pass, 16, dst_ptr, - dst_pixels_per_line, sse); + var = vpx_variance16x16_media(second_pass, 16, dst_ptr, + dst_pixels_per_line, sse); } return var; } -#endif /* HAVE_MEDIA */ +#endif // HAVE_MEDIA #if HAVE_NEON @@ -129,4 +133,5 @@ unsigned int vp8_sub_pixel_variance16x16_neon return vp8_sub_pixel_variance16x16_neon_func(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse); } -#endif +#endif // HAVE_NEON +#endif // CONFIG_VP8_ENCODER diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/copy_c.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/copy_c.c index fd96c863491..e3392913f63 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/common/copy_c.c +++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/copy_c.c @@ -11,11 +11,11 @@ #include <string.h> -#include "vpx_config.h" +#include "./vp8_rtcd.h" #include "vpx/vpx_integer.h" /* Copy 2 macroblocks to a buffer */ -void vp8_copy32xn_c(unsigned char *src_ptr, int src_stride, +void vp8_copy32xn_c(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height) { diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/filter.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/filter.c index 25266f86827..84c608effaa 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/common/filter.c +++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/filter.c @@ -10,6 +10,7 @@ #include "filter.h" 
+#include "./vp8_rtcd.h" DECLARE_ALIGNED(16, const short, vp8_bilinear_filters[8][2]) = { diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/generic/systemdependent.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/generic/systemdependent.c index d84df334810..4393ced48c8 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/common/generic/systemdependent.c +++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/generic/systemdependent.c @@ -17,6 +17,7 @@ #include "vpx_ports/x86.h" #endif #include "vp8/common/onyxc_int.h" +#include "vp8/common/systemdependent.h" #if CONFIG_MULTITHREAD #if HAVE_UNISTD_H && !defined(__OS2__) diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/idctllm.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/idctllm.c index 47af52f04e7..f5403c5aaf7 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/common/idctllm.c +++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/idctllm.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include "./vp8_rtcd.h" /**************************************************************************** * Notes: diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/mfqe.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/mfqe.c index d12dea19364..5c0680f42d4 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/common/mfqe.c +++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/mfqe.c @@ -151,14 +151,14 @@ static void multiframe_quality_enhance_block if (blksize == 16) { - actd = (vp8_variance16x16(yd, yd_stride, VP8_ZEROS, 0, &sse)+128)>>8; - act = (vp8_variance16x16(y, y_stride, VP8_ZEROS, 0, &sse)+128)>>8; + actd = (vpx_variance16x16(yd, yd_stride, VP8_ZEROS, 0, &sse)+128)>>8; + act = (vpx_variance16x16(y, y_stride, VP8_ZEROS, 0, &sse)+128)>>8; #ifdef USE_SSD - vp8_variance16x16(y, y_stride, yd, yd_stride, &sse); + vpx_variance16x16(y, y_stride, yd, yd_stride, &sse); sad = (sse + 128)>>8; - vp8_variance8x8(u, uv_stride, ud, uvd_stride, &sse); + vpx_variance8x8(u, uv_stride, ud, uvd_stride, &sse); usad = (sse + 32)>>6; - vp8_variance8x8(v, uv_stride, vd, uvd_stride, &sse); + vpx_variance8x8(v, uv_stride, vd, uvd_stride, &sse); vsad = (sse + 32)>>6; #else sad = (vpx_sad16x16(y, y_stride, yd, yd_stride) + 128) >> 8; @@ -168,14 +168,14 @@ static void multiframe_quality_enhance_block } else /* if (blksize == 8) */ { - actd = (vp8_variance8x8(yd, yd_stride, VP8_ZEROS, 0, &sse)+32)>>6; - act = (vp8_variance8x8(y, y_stride, VP8_ZEROS, 0, &sse)+32)>>6; + actd = (vpx_variance8x8(yd, yd_stride, VP8_ZEROS, 0, &sse)+32)>>6; + act = (vpx_variance8x8(y, y_stride, VP8_ZEROS, 0, &sse)+32)>>6; #ifdef USE_SSD - vp8_variance8x8(y, y_stride, yd, yd_stride, &sse); + vpx_variance8x8(y, y_stride, yd, yd_stride, &sse); sad = (sse + 32)>>6; - vp8_variance4x4(u, uv_stride, ud, uvd_stride, &sse); + vpx_variance4x4(u, uv_stride, ud, uvd_stride, &sse); usad = (sse + 8)>>4; - vp8_variance4x4(v, uv_stride, vd, uvd_stride, &sse); + vpx_variance4x4(v, uv_stride, vd, uvd_stride, &sse); vsad = (sse + 8)>>4; #else sad = (vpx_sad8x8(y, y_stride, yd, yd_stride) + 32) >> 6; diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/postproc.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/postproc.c index 266431a3240..a4e6ae170c9 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/common/postproc.c +++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/postproc.c @@ -427,7 +427,7 @@ void vp8_de_noise(VP8_COMMON *cm, } } -double 
vp8_gaussian(double sigma, double mu, double x) +static double gaussian(double sigma, double mu, double x) { return 1 / (sigma * sqrt(2.0 * 3.14159265)) * (exp(-(x - mu) * (x - mu) / (2 * sigma * sigma))); @@ -455,7 +455,7 @@ static void fillrd(struct postproc_state *state, int q, int a) for (i = -32; i < 32; i++) { - const int v = (int)(.5 + 256 * vp8_gaussian(sigma, 0, i)); + const int v = (int)(.5 + 256 * gaussian(sigma, 0, i)); if (v) { diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/rtcd_defs.pl b/chromium/third_party/libvpx/source/libvpx/vp8/common/rtcd_defs.pl index 56b7db7ec33..fed20887f8e 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/common/rtcd_defs.pl +++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/rtcd_defs.pl @@ -215,7 +215,7 @@ $vp8_sixtap_predict8x4_media=vp8_sixtap_predict8x4_armv6; $vp8_sixtap_predict8x4_dspr2=vp8_sixtap_predict8x4_dspr2; add_proto qw/void vp8_sixtap_predict4x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; -# Disable neon while investigating https://code.google.com/p/webm/issues/detail?id=817 +#TODO(johannkoenig): fix the neon version https://code.google.com/p/webm/issues/detail?id=817 specialize qw/vp8_sixtap_predict4x4 mmx ssse3 media dspr2/; $vp8_sixtap_predict4x4_media=vp8_sixtap_predict4x4_armv6; $vp8_sixtap_predict4x4_dspr2=vp8_sixtap_predict4x4_dspr2; @@ -233,35 +233,11 @@ specialize qw/vp8_bilinear_predict8x4 mmx media neon/; $vp8_bilinear_predict8x4_media=vp8_bilinear_predict8x4_armv6; add_proto qw/void vp8_bilinear_predict4x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; -specialize qw/vp8_bilinear_predict4x4 mmx media neon/; +#TODO(johannkoenig): fix the neon version https://code.google.com/p/webm/issues/detail?id=892 +specialize qw/vp8_bilinear_predict4x4 mmx media/; $vp8_bilinear_predict4x4_media=vp8_bilinear_predict4x4_armv6; # -# Whole-pixel Variance -# -add_proto qw/unsigned int vp8_variance4x4/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp8_variance4x4 mmx sse2/; -$vp8_variance4x4_sse2=vp8_variance4x4_wmt; - -add_proto qw/unsigned int vp8_variance8x8/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp8_variance8x8 mmx sse2 media neon/; -$vp8_variance8x8_sse2=vp8_variance8x8_wmt; -$vp8_variance8x8_media=vp8_variance8x8_armv6; - -add_proto qw/unsigned int vp8_variance8x16/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp8_variance8x16 mmx sse2 neon/; -$vp8_variance8x16_sse2=vp8_variance8x16_wmt; - -add_proto qw/unsigned int vp8_variance16x8/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp8_variance16x8 mmx sse2 neon/; -$vp8_variance16x8_sse2=vp8_variance16x8_wmt; - -add_proto qw/unsigned int vp8_variance16x16/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp8_variance16x16 mmx sse2 media neon/; -$vp8_variance16x16_sse2=vp8_variance16x16_wmt; -$vp8_variance16x16_media=vp8_variance16x16_armv6; - -# # Sub-pixel Variance # add_proto qw/unsigned int vp8_sub_pixel_variance4x4/, "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, 
int Refstride, unsigned int *sse"; @@ -269,10 +245,9 @@ specialize qw/vp8_sub_pixel_variance4x4 mmx sse2/; $vp8_sub_pixel_variance4x4_sse2=vp8_sub_pixel_variance4x4_wmt; add_proto qw/unsigned int vp8_sub_pixel_variance8x8/, "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"; -specialize qw/vp8_sub_pixel_variance8x8 mmx sse2 media neon_asm/; +specialize qw/vp8_sub_pixel_variance8x8 mmx sse2 media/; $vp8_sub_pixel_variance8x8_sse2=vp8_sub_pixel_variance8x8_wmt; $vp8_sub_pixel_variance8x8_media=vp8_sub_pixel_variance8x8_armv6; -$vp8_sub_pixel_variance8x8_neon_asm=vp8_sub_pixel_variance8x8_neon; add_proto qw/unsigned int vp8_sub_pixel_variance8x16/, "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"; specialize qw/vp8_sub_pixel_variance8x16 mmx sse2/; @@ -309,31 +284,10 @@ $vp8_variance_halfpixvar16x16_hv_media=vp8_variance_halfpixvar16x16_hv_armv6; if (vpx_config("CONFIG_VP8_ENCODER") eq "yes") { # -# Sum of squares (vector) -# -add_proto qw/unsigned int vp8_get_mb_ss/, "const short *"; -specialize qw/vp8_get_mb_ss mmx sse2/; - -# -# SSE (Sum Squared Error) -# -add_proto qw/unsigned int vp8_sub_pixel_mse16x16/, "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"; -specialize qw/vp8_sub_pixel_mse16x16 mmx sse2/; -$vp8_sub_pixel_mse16x16_sse2=vp8_sub_pixel_mse16x16_wmt; - -add_proto qw/unsigned int vp8_mse16x16/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp8_mse16x16 mmx sse2 media neon/; -$vp8_mse16x16_sse2=vp8_mse16x16_wmt; -$vp8_mse16x16_media=vp8_mse16x16_armv6; - -add_proto qw/unsigned int vp8_get4x4sse_cs/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride"; -specialize qw/vp8_get4x4sse_cs mmx neon/; - -# # Block copy # if ($opts{arch} =~ /x86/) { - add_proto qw/void vp8_copy32xn/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n"; + add_proto qw/void vp8_copy32xn/, "const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n"; specialize qw/vp8_copy32xn sse2 sse3/; } diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/variance.h b/chromium/third_party/libvpx/source/libvpx/vp8/common/variance.h index 552a28025e6..c6c9f41bf6a 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/common/variance.h +++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/variance.h @@ -29,7 +29,7 @@ typedef unsigned int(*vpx_sad_fn_t)( typedef void (*vp8_copy32xn_fn_t)( const unsigned char *src_ptr, int source_stride, - const unsigned char *ref_ptr, + unsigned char *ref_ptr, int ref_stride, int n); @@ -39,6 +39,7 @@ typedef void (*vpx_sad_multi_fn_t)( const unsigned char *ref_array, int ref_stride, unsigned int *sad_array); + typedef void (*vpx_sad_multi_d_fn_t) ( const unsigned char *src_ptr, @@ -48,7 +49,7 @@ typedef void (*vpx_sad_multi_d_fn_t) unsigned int *sad_array ); -typedef unsigned int (*vp8_variance_fn_t) +typedef unsigned int (*vpx_variance_fn_t) ( const unsigned char *src_ptr, int source_stride, @@ -68,37 +69,14 @@ typedef unsigned int (*vp8_subpixvariance_fn_t) unsigned int *sse ); -typedef void (*vp8_ssimpf_fn_t) - ( - unsigned char *s, - int sp, - unsigned char *r, - int rp, - unsigned long 
*sum_s, - unsigned long *sum_r, - unsigned long *sum_sq_s, - unsigned long *sum_sq_r, - unsigned long *sum_sxr - ); - -typedef unsigned int (*vp8_getmbss_fn_t)(const short *); - -typedef unsigned int (*vp8_get16x16prederror_fn_t) - ( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int ref_stride - ); - typedef struct variance_vtable { vpx_sad_fn_t sdf; - vp8_variance_fn_t vf; + vpx_variance_fn_t vf; vp8_subpixvariance_fn_t svf; - vp8_variance_fn_t svf_halfpix_h; - vp8_variance_fn_t svf_halfpix_v; - vp8_variance_fn_t svf_halfpix_hv; + vpx_variance_fn_t svf_halfpix_h; + vpx_variance_fn_t svf_halfpix_v; + vpx_variance_fn_t svf_halfpix_hv; vpx_sad_multi_fn_t sdx3f; vpx_sad_multi_fn_t sdx8f; vpx_sad_multi_d_fn_t sdx4df; diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/variance_c.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/variance_c.c index 773b655efc5..02915a4defd 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/common/variance_c.c +++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/variance_c.c @@ -8,43 +8,34 @@ * be found in the AUTHORS file in the root of the source tree. */ - -#include "variance.h" +#include "./vp8_rtcd.h" #include "filter.h" +#include "variance.h" - -unsigned int vp8_get_mb_ss_c -( - const short *src_ptr -) -{ - unsigned int i = 0, sum = 0; - - do - { - sum += (src_ptr[i] * src_ptr[i]); - i++; - } - while (i < 256); - - return sum; +/* This is a bad idea. + * ctz = count trailing zeros */ +static int ctz(int a) { + int b = 0; + while (a != 1) { + a >>= 1; + b++; + } + return b; } - -static void variance( +static unsigned int variance( const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int recon_stride, int w, int h, - unsigned int *sse, - int *sum) + unsigned int *sse) { int i, j; - int diff; + int diff, sum; - *sum = 0; + sum = 0; *sse = 0; for (i = 0; i < h; i++) @@ -52,114 +43,17 @@ static void variance( for (j = 0; j < w; j++) { diff = src_ptr[j] - ref_ptr[j]; - *sum += diff; + sum += diff; *sse += diff * diff; } src_ptr += source_stride; ref_ptr += recon_stride; } -} - - -unsigned int vp8_variance16x16_c( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - unsigned int var; - int avg; - - variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg); - *sse = var; - return (var - (((unsigned int)avg * avg) >> 8)); -} - -unsigned int vp8_variance8x16_c( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - unsigned int var; - int avg; - - - variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 16, &var, &avg); - *sse = var; - return (var - (((unsigned int)avg * avg) >> 7)); + return (*sse - (((unsigned int)sum * sum) >> (int)((ctz(w) + ctz(h))))); } -unsigned int vp8_variance16x8_c( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - unsigned int var; - int avg; - - - variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 8, &var, &avg); - *sse = var; - return (var - (((unsigned int)avg * avg) >> 7)); -} - - -unsigned int vp8_variance8x8_c( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - unsigned int var; - int avg; - - - variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8, &var, &avg); - *sse = var; - return 
(var - (((unsigned int)avg * avg) >> 6)); -} - -unsigned int vp8_variance4x4_c( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - unsigned int var; - int avg; - - - variance(src_ptr, source_stride, ref_ptr, recon_stride, 4, 4, &var, &avg); - *sse = var; - return (var - (((unsigned int)avg * avg) >> 4)); -} - - -unsigned int vp8_mse16x16_c( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - unsigned int var; - int avg; - - variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg); - *sse = var; - return var; -} - - /**************************************************************************** * * ROUTINE : filter_block2d_bil_first_pass @@ -303,7 +197,7 @@ unsigned int vp8_sub_pixel_variance4x4_c /* Now filter Verticaly */ var_filter_block2d_bil_second_pass(FData3, temp2, 4, 4, 4, 4, VFilter); - return vp8_variance4x4_c(temp2, 4, dst_ptr, dst_pixels_per_line, sse); + return variance(temp2, 4, dst_ptr, dst_pixels_per_line, 4, 4, sse); } @@ -328,7 +222,7 @@ unsigned int vp8_sub_pixel_variance8x8_c var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 8, HFilter); var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 8, 8, VFilter); - return vp8_variance8x8_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse); + return variance(temp2, 8, dst_ptr, dst_pixels_per_line, 8, 8, sse); } unsigned int vp8_sub_pixel_variance16x16_c @@ -352,7 +246,7 @@ unsigned int vp8_sub_pixel_variance16x16_c var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 16, HFilter); var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 16, 16, VFilter); - return vp8_variance16x16_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse); + return variance(temp2, 16, dst_ptr, dst_pixels_per_line, 16, 16, sse); } @@ -392,21 +286,6 @@ unsigned int vp8_variance_halfpixvar16x16_hv_c( } -unsigned int vp8_sub_pixel_mse16x16_c -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) -{ - vp8_sub_pixel_variance16x16_c(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse); - return *sse; -} - unsigned int vp8_sub_pixel_variance16x8_c ( const unsigned char *src_ptr, @@ -428,7 +307,7 @@ unsigned int vp8_sub_pixel_variance16x8_c var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 16, HFilter); var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 8, 16, VFilter); - return vp8_variance16x8_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse); + return variance(temp2, 16, dst_ptr, dst_pixels_per_line, 16, 8, sse); } unsigned int vp8_sub_pixel_variance8x16_c @@ -454,5 +333,5 @@ unsigned int vp8_sub_pixel_variance8x16_c var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 8, HFilter); var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 16, 8, VFilter); - return vp8_variance8x16_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse); + return variance(temp2, 8, dst_ptr, dst_pixels_per_line, 8, 16, sse); } diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/variance_impl_sse2.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/variance_impl_sse2.asm index 761433c11ea..26de5e86097 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/variance_impl_sse2.asm +++ 
b/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/variance_impl_sse2.asm @@ -13,393 +13,6 @@ %define xmm_filter_shift 7 -;unsigned int vp8_get_mb_ss_sse2 -;( -; short *src_ptr -;) -global sym(vp8_get_mb_ss_sse2) PRIVATE -sym(vp8_get_mb_ss_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 1 - GET_GOT rbx - push rsi - push rdi - sub rsp, 16 - ; end prolog - - - mov rax, arg(0) ;[src_ptr] - mov rcx, 8 - pxor xmm4, xmm4 - -.NEXTROW: - movdqa xmm0, [rax] - movdqa xmm1, [rax+16] - movdqa xmm2, [rax+32] - movdqa xmm3, [rax+48] - pmaddwd xmm0, xmm0 - pmaddwd xmm1, xmm1 - pmaddwd xmm2, xmm2 - pmaddwd xmm3, xmm3 - - paddd xmm0, xmm1 - paddd xmm2, xmm3 - paddd xmm4, xmm0 - paddd xmm4, xmm2 - - add rax, 0x40 - dec rcx - ja .NEXTROW - - movdqa xmm3,xmm4 - psrldq xmm4,8 - paddd xmm4,xmm3 - movdqa xmm3,xmm4 - psrldq xmm4,4 - paddd xmm4,xmm3 - movq rax,xmm4 - - - ; begin epilog - add rsp, 16 - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -;unsigned int vp8_get16x16var_sse2 -;( -; unsigned char * src_ptr, -; int source_stride, -; unsigned char * ref_ptr, -; int recon_stride, -; unsigned int * SSE, -; int * Sum -;) -global sym(vp8_get16x16var_sse2) PRIVATE -sym(vp8_get16x16var_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rbx - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;[src_ptr] - mov rdi, arg(2) ;[ref_ptr] - - movsxd rax, DWORD PTR arg(1) ;[source_stride] - movsxd rdx, DWORD PTR arg(3) ;[recon_stride] - - ; Prefetch data - lea rcx, [rax+rax*2] - prefetcht0 [rsi] - prefetcht0 [rsi+rax] - prefetcht0 [rsi+rax*2] - prefetcht0 [rsi+rcx] - lea rbx, [rsi+rax*4] - prefetcht0 [rbx] - prefetcht0 [rbx+rax] - prefetcht0 [rbx+rax*2] - prefetcht0 [rbx+rcx] - - lea rcx, [rdx+rdx*2] - prefetcht0 [rdi] - prefetcht0 [rdi+rdx] - prefetcht0 [rdi+rdx*2] - prefetcht0 [rdi+rcx] - lea rbx, [rdi+rdx*4] - prefetcht0 [rbx] - prefetcht0 [rbx+rdx] - prefetcht0 [rbx+rdx*2] - prefetcht0 [rbx+rcx] - - pxor xmm0, xmm0 ; clear xmm0 for unpack - pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs - - pxor xmm6, xmm6 ; clear xmm6 for accumulating sse - mov rcx, 16 - -.var16loop: - movdqu xmm1, XMMWORD PTR [rsi] - movdqu xmm2, XMMWORD PTR [rdi] - - prefetcht0 [rsi+rax*8] - prefetcht0 [rdi+rdx*8] - - movdqa xmm3, xmm1 - movdqa xmm4, xmm2 - - - punpcklbw xmm1, xmm0 - punpckhbw xmm3, xmm0 - - punpcklbw xmm2, xmm0 - punpckhbw xmm4, xmm0 - - - psubw xmm1, xmm2 - psubw xmm3, xmm4 - - paddw xmm7, xmm1 - pmaddwd xmm1, xmm1 - - paddw xmm7, xmm3 - pmaddwd xmm3, xmm3 - - paddd xmm6, xmm1 - paddd xmm6, xmm3 - - add rsi, rax - add rdi, rdx - - sub rcx, 1 - jnz .var16loop - - - movdqa xmm1, xmm6 - pxor xmm6, xmm6 - - pxor xmm5, xmm5 - punpcklwd xmm6, xmm7 - - punpckhwd xmm5, xmm7 - psrad xmm5, 16 - - psrad xmm6, 16 - paddd xmm6, xmm5 - - movdqa xmm2, xmm1 - punpckldq xmm1, xmm0 - - punpckhdq xmm2, xmm0 - movdqa xmm7, xmm6 - - paddd xmm1, xmm2 - punpckldq xmm6, xmm0 - - punpckhdq xmm7, xmm0 - paddd xmm6, xmm7 - - movdqa xmm2, xmm1 - movdqa xmm7, xmm6 - - psrldq xmm1, 8 - psrldq xmm6, 8 - - paddd xmm7, xmm6 - paddd xmm1, xmm2 - - mov rax, arg(5) ;[Sum] - mov rdi, arg(4) ;[SSE] - - movd DWORD PTR [rax], xmm7 - movd DWORD PTR [rdi], xmm1 - - - ; begin epilog - pop rdi - pop rsi - pop rbx - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - - - -;unsigned int vp8_get8x8var_sse2 -;( -; unsigned char * src_ptr, -; int source_stride, -; unsigned char * ref_ptr, -; int recon_stride, -; unsigned int * SSE, -; int * Sum -;) -global sym(vp8_get8x8var_sse2) PRIVATE 
-sym(vp8_get8x8var_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - sub rsp, 16 - ; end prolog - - mov rsi, arg(0) ;[src_ptr] - mov rdi, arg(2) ;[ref_ptr] - - movsxd rax, DWORD PTR arg(1) ;[source_stride] - movsxd rdx, DWORD PTR arg(3) ;[recon_stride] - - pxor xmm0, xmm0 ; clear xmm0 for unpack - pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs - - movq xmm1, QWORD PTR [rsi] - movq xmm2, QWORD PTR [rdi] - - punpcklbw xmm1, xmm0 - punpcklbw xmm2, xmm0 - - psubsw xmm1, xmm2 - paddw xmm7, xmm1 - - pmaddwd xmm1, xmm1 - - movq xmm2, QWORD PTR[rsi + rax] - movq xmm3, QWORD PTR[rdi + rdx] - - punpcklbw xmm2, xmm0 - punpcklbw xmm3, xmm0 - - psubsw xmm2, xmm3 - paddw xmm7, xmm2 - - pmaddwd xmm2, xmm2 - paddd xmm1, xmm2 - - - movq xmm2, QWORD PTR[rsi + rax * 2] - movq xmm3, QWORD PTR[rdi + rdx * 2] - - punpcklbw xmm2, xmm0 - punpcklbw xmm3, xmm0 - - psubsw xmm2, xmm3 - paddw xmm7, xmm2 - - pmaddwd xmm2, xmm2 - paddd xmm1, xmm2 - - - lea rsi, [rsi + rax * 2] - lea rdi, [rdi + rdx * 2] - movq xmm2, QWORD PTR[rsi + rax] - movq xmm3, QWORD PTR[rdi + rdx] - - punpcklbw xmm2, xmm0 - punpcklbw xmm3, xmm0 - - psubsw xmm2, xmm3 - paddw xmm7, xmm2 - - pmaddwd xmm2, xmm2 - paddd xmm1, xmm2 - - movq xmm2, QWORD PTR[rsi + rax *2] - movq xmm3, QWORD PTR[rdi + rdx *2] - - punpcklbw xmm2, xmm0 - punpcklbw xmm3, xmm0 - - psubsw xmm2, xmm3 - paddw xmm7, xmm2 - - pmaddwd xmm2, xmm2 - paddd xmm1, xmm2 - - - lea rsi, [rsi + rax * 2] - lea rdi, [rdi + rdx * 2] - - - movq xmm2, QWORD PTR[rsi + rax] - movq xmm3, QWORD PTR[rdi + rdx] - - punpcklbw xmm2, xmm0 - punpcklbw xmm3, xmm0 - - psubsw xmm2, xmm3 - paddw xmm7, xmm2 - - pmaddwd xmm2, xmm2 - paddd xmm1, xmm2 - - movq xmm2, QWORD PTR[rsi + rax *2] - movq xmm3, QWORD PTR[rdi + rdx *2] - - punpcklbw xmm2, xmm0 - punpcklbw xmm3, xmm0 - - psubsw xmm2, xmm3 - paddw xmm7, xmm2 - - pmaddwd xmm2, xmm2 - paddd xmm1, xmm2 - - - lea rsi, [rsi + rax * 2] - lea rdi, [rdi + rdx * 2] - - movq xmm2, QWORD PTR[rsi + rax] - movq xmm3, QWORD PTR[rdi + rdx] - - punpcklbw xmm2, xmm0 - punpcklbw xmm3, xmm0 - - psubsw xmm2, xmm3 - paddw xmm7, xmm2 - - pmaddwd xmm2, xmm2 - paddd xmm1, xmm2 - - - movdqa xmm6, xmm7 - punpcklwd xmm6, xmm0 - - punpckhwd xmm7, xmm0 - movdqa xmm2, xmm1 - - paddw xmm6, xmm7 - punpckldq xmm1, xmm0 - - punpckhdq xmm2, xmm0 - movdqa xmm7, xmm6 - - paddd xmm1, xmm2 - punpckldq xmm6, xmm0 - - punpckhdq xmm7, xmm0 - paddw xmm6, xmm7 - - movdqa xmm2, xmm1 - movdqa xmm7, xmm6 - - psrldq xmm1, 8 - psrldq xmm6, 8 - - paddw xmm7, xmm6 - paddd xmm1, xmm2 - - mov rax, arg(5) ;[Sum] - mov rdi, arg(4) ;[SSE] - - movq rdx, xmm7 - movsx rcx, dx - - mov dword ptr [rax], ecx - movd DWORD PTR [rdi], xmm1 - - ; begin epilog - add rsp, 16 - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - ;void vp8_filter_block2d_bil_var_sse2 ;( ; unsigned char *ref_ptr, diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/variance_ssse3.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/variance_ssse3.c index 73eb90df61f..2a0df640a90 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/variance_ssse3.c +++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/variance_ssse3.c @@ -8,19 +8,11 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ +#include "./vp8_rtcd.h" #include "vpx_config.h" #include "vp8/common/variance.h" #include "vpx_ports/mem.h" -extern unsigned int vp8_get16x16var_sse2 -( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *SSE, - int *Sum -); extern void vp8_half_horiz_vert_variance16x_h_sse2 ( const unsigned char *ref_ptr, diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/vp8_variance_impl_mmx.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/vp8_variance_impl_mmx.asm new file mode 100644 index 00000000000..97f25275df2 --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/vp8_variance_impl_mmx.asm @@ -0,0 +1,353 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +%define mmx_filter_shift 7 + +;void vp8_filter_block2d_bil4x4_var_mmx +;( +; unsigned char *ref_ptr, +; int ref_pixels_per_line, +; unsigned char *src_ptr, +; int src_pixels_per_line, +; unsigned short *HFilter, +; unsigned short *VFilter, +; int *sum, +; unsigned int *sumsquared +;) +global sym(vp8_filter_block2d_bil4x4_var_mmx) PRIVATE +sym(vp8_filter_block2d_bil4x4_var_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 8 + GET_GOT rbx + push rsi + push rdi + sub rsp, 16 + ; end prolog + + + pxor mm6, mm6 ; + pxor mm7, mm7 ; + + mov rax, arg(4) ;HFilter ; + mov rdx, arg(5) ;VFilter ; + + mov rsi, arg(0) ;ref_ptr ; + mov rdi, arg(2) ;src_ptr ; + + mov rcx, 4 ; + pxor mm0, mm0 ; + + movd mm1, [rsi] ; + movd mm3, [rsi+1] ; + + punpcklbw mm1, mm0 ; + pmullw mm1, [rax] ; + + punpcklbw mm3, mm0 ; + pmullw mm3, [rax+8] ; + + paddw mm1, mm3 ; + paddw mm1, [GLOBAL(mmx_bi_rd)] ; + + psraw mm1, mmx_filter_shift ; + movq mm5, mm1 + +%if ABI_IS_32BIT + add rsi, dword ptr arg(1) ;ref_pixels_per_line ; +%else + movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ; + add rsi, r8 +%endif + +.filter_block2d_bil4x4_var_mmx_loop: + + movd mm1, [rsi] ; + movd mm3, [rsi+1] ; + + punpcklbw mm1, mm0 ; + pmullw mm1, [rax] ; + + punpcklbw mm3, mm0 ; + pmullw mm3, [rax+8] ; + + paddw mm1, mm3 ; + paddw mm1, [GLOBAL(mmx_bi_rd)] ; + + psraw mm1, mmx_filter_shift ; + movq mm3, mm5 ; + + movq mm5, mm1 ; + pmullw mm3, [rdx] ; + + pmullw mm1, [rdx+8] ; + paddw mm1, mm3 ; + + + paddw mm1, [GLOBAL(mmx_bi_rd)] ; + psraw mm1, mmx_filter_shift ; + + movd mm3, [rdi] ; + punpcklbw mm3, mm0 ; + + psubw mm1, mm3 ; + paddw mm6, mm1 ; + + pmaddwd mm1, mm1 ; + paddd mm7, mm1 ; + +%if ABI_IS_32BIT + add rsi, dword ptr arg(1) ;ref_pixels_per_line ; + add rdi, dword ptr arg(3) ;src_pixels_per_line ; +%else + movsxd r8, dword ptr arg(1) ;ref_pixels_per_line + movsxd r9, dword ptr arg(3) ;src_pixels_per_line + add rsi, r8 + add rdi, r9 +%endif + sub rcx, 1 ; + jnz .filter_block2d_bil4x4_var_mmx_loop ; + + + pxor mm3, mm3 ; + pxor mm2, mm2 ; + + punpcklwd mm2, mm6 ; + punpckhwd mm3, mm6 ; + + paddd mm2, mm3 ; + movq mm6, mm2 ; + + psrlq mm6, 32 ; + paddd mm2, mm6 ; + + psrad mm2, 16 ; + movq mm4, mm7 ; + + psrlq mm4, 32 ; + paddd mm4, mm7 ; + + mov rdi, arg(6) ;sum + mov rsi, arg(7) ;sumsquared + + movd dword ptr [rdi], mm2 ; + movd dword ptr [rsi], mm4 ; + + + + ; begin 
epilog + add rsp, 16 + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + + + +;void vp8_filter_block2d_bil_var_mmx +;( +; unsigned char *ref_ptr, +; int ref_pixels_per_line, +; unsigned char *src_ptr, +; int src_pixels_per_line, +; unsigned int Height, +; unsigned short *HFilter, +; unsigned short *VFilter, +; int *sum, +; unsigned int *sumsquared +;) +global sym(vp8_filter_block2d_bil_var_mmx) PRIVATE +sym(vp8_filter_block2d_bil_var_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 9 + GET_GOT rbx + push rsi + push rdi + sub rsp, 16 + ; end prolog + + pxor mm6, mm6 ; + pxor mm7, mm7 ; + mov rax, arg(5) ;HFilter ; + + mov rdx, arg(6) ;VFilter ; + mov rsi, arg(0) ;ref_ptr ; + + mov rdi, arg(2) ;src_ptr ; + movsxd rcx, dword ptr arg(4) ;Height ; + + pxor mm0, mm0 ; + movq mm1, [rsi] ; + + movq mm3, [rsi+1] ; + movq mm2, mm1 ; + + movq mm4, mm3 ; + punpcklbw mm1, mm0 ; + + punpckhbw mm2, mm0 ; + pmullw mm1, [rax] ; + + pmullw mm2, [rax] ; + punpcklbw mm3, mm0 ; + + punpckhbw mm4, mm0 ; + pmullw mm3, [rax+8] ; + + pmullw mm4, [rax+8] ; + paddw mm1, mm3 ; + + paddw mm2, mm4 ; + paddw mm1, [GLOBAL(mmx_bi_rd)] ; + + psraw mm1, mmx_filter_shift ; + paddw mm2, [GLOBAL(mmx_bi_rd)] ; + + psraw mm2, mmx_filter_shift ; + movq mm5, mm1 + + packuswb mm5, mm2 ; +%if ABI_IS_32BIT + add rsi, dword ptr arg(1) ;ref_pixels_per_line +%else + movsxd r8, dword ptr arg(1) ;ref_pixels_per_line + add rsi, r8 +%endif + +.filter_block2d_bil_var_mmx_loop: + + movq mm1, [rsi] ; + movq mm3, [rsi+1] ; + + movq mm2, mm1 ; + movq mm4, mm3 ; + + punpcklbw mm1, mm0 ; + punpckhbw mm2, mm0 ; + + pmullw mm1, [rax] ; + pmullw mm2, [rax] ; + + punpcklbw mm3, mm0 ; + punpckhbw mm4, mm0 ; + + pmullw mm3, [rax+8] ; + pmullw mm4, [rax+8] ; + + paddw mm1, mm3 ; + paddw mm2, mm4 ; + + paddw mm1, [GLOBAL(mmx_bi_rd)] ; + psraw mm1, mmx_filter_shift ; + + paddw mm2, [GLOBAL(mmx_bi_rd)] ; + psraw mm2, mmx_filter_shift ; + + movq mm3, mm5 ; + movq mm4, mm5 ; + + punpcklbw mm3, mm0 ; + punpckhbw mm4, mm0 ; + + movq mm5, mm1 ; + packuswb mm5, mm2 ; + + pmullw mm3, [rdx] ; + pmullw mm4, [rdx] ; + + pmullw mm1, [rdx+8] ; + pmullw mm2, [rdx+8] ; + + paddw mm1, mm3 ; + paddw mm2, mm4 ; + + paddw mm1, [GLOBAL(mmx_bi_rd)] ; + paddw mm2, [GLOBAL(mmx_bi_rd)] ; + + psraw mm1, mmx_filter_shift ; + psraw mm2, mmx_filter_shift ; + + movq mm3, [rdi] ; + movq mm4, mm3 ; + + punpcklbw mm3, mm0 ; + punpckhbw mm4, mm0 ; + + psubw mm1, mm3 ; + psubw mm2, mm4 ; + + paddw mm6, mm1 ; + pmaddwd mm1, mm1 ; + + paddw mm6, mm2 ; + pmaddwd mm2, mm2 ; + + paddd mm7, mm1 ; + paddd mm7, mm2 ; + +%if ABI_IS_32BIT + add rsi, dword ptr arg(1) ;ref_pixels_per_line ; + add rdi, dword ptr arg(3) ;src_pixels_per_line ; +%else + movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ; + movsxd r9, dword ptr arg(3) ;src_pixels_per_line ; + add rsi, r8 + add rdi, r9 +%endif + sub rcx, 1 ; + jnz .filter_block2d_bil_var_mmx_loop ; + + + pxor mm3, mm3 ; + pxor mm2, mm2 ; + + punpcklwd mm2, mm6 ; + punpckhwd mm3, mm6 ; + + paddd mm2, mm3 ; + movq mm6, mm2 ; + + psrlq mm6, 32 ; + paddd mm2, mm6 ; + + psrad mm2, 16 ; + movq mm4, mm7 ; + + psrlq mm4, 32 ; + paddd mm4, mm7 ; + + mov rdi, arg(7) ;sum + mov rsi, arg(8) ;sumsquared + + movd dword ptr [rdi], mm2 ; + movd dword ptr [rsi], mm4 ; + + ; begin epilog + add rsp, 16 + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +SECTION_RODATA +;short mmx_bi_rd[4] = { 64, 64, 64, 64}; +align 16 +mmx_bi_rd: + times 4 dw 64 diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/variance_mmx.c 
b/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/vp8_variance_mmx.c index 02e02420f46..e594b1e65ee 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/variance_mmx.c +++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/vp8_variance_mmx.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include "./vp8_rtcd.h" #include "vpx_config.h" #include "vp8/common/variance.h" #include "vpx_ports/mem.h" @@ -34,25 +35,6 @@ extern void filter_block1d_v6_mmx short *filter ); -extern unsigned int vp8_get_mb_ss_mmx(const short *src_ptr); -extern unsigned int vp8_get8x8var_mmx -( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *SSE, - int *Sum -); -extern unsigned int vp8_get4x4var_mmx -( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *SSE, - int *Sum -); extern void vp8_filter_block2d_bil4x4_var_mmx ( const unsigned char *ref_ptr, @@ -77,127 +59,6 @@ extern void vp8_filter_block2d_bil_var_mmx unsigned int *sumsquared ); - -unsigned int vp8_variance4x4_mmx( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - unsigned int var; - int avg; - - vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ; - *sse = var; - return (var - (((unsigned int)avg * avg) >> 4)); - -} - -unsigned int vp8_variance8x8_mmx( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - unsigned int var; - int avg; - - vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ; - *sse = var; - - return (var - (((unsigned int)avg * avg) >> 6)); - -} - -unsigned int vp8_mse16x16_mmx( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - unsigned int sse0, sse1, sse2, sse3, var; - int sum0, sum1, sum2, sum3; - - - vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; - vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1); - vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2) ; - vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3); - - var = sse0 + sse1 + sse2 + sse3; - *sse = var; - return var; -} - - -unsigned int vp8_variance16x16_mmx( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - unsigned int sse0, sse1, sse2, sse3, var; - int sum0, sum1, sum2, sum3, avg; - - - vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; - vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1); - vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2) ; - vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3); - - var = sse0 + sse1 + sse2 + sse3; - avg = sum0 + sum1 + sum2 + sum3; - *sse = var; - return (var - (((unsigned int)avg * avg) >> 8)); -} - -unsigned int vp8_variance16x8_mmx( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - 
unsigned int sse0, sse1, var; - int sum0, sum1, avg; - - vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; - vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1); - - var = sse0 + sse1; - avg = sum0 + sum1; - *sse = var; - return (var - (((unsigned int)avg * avg) >> 7)); - -} - - -unsigned int vp8_variance8x16_mmx( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - unsigned int sse0, sse1, var; - int sum0, sum1, avg; - - vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; - vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1) ; - - var = sse0 + sse1; - avg = sum0 + sum1; - *sse = var; - - return (var - (((unsigned int)avg * avg) >> 7)); - -} - - unsigned int vp8_sub_pixel_variance4x4_mmx ( const unsigned char *src_ptr, @@ -286,20 +147,6 @@ unsigned int vp8_sub_pixel_variance16x16_mmx } -unsigned int vp8_sub_pixel_mse16x16_mmx( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) -{ - vp8_sub_pixel_variance16x16_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse); - return *sse; -} - unsigned int vp8_sub_pixel_variance16x8_mmx ( const unsigned char *src_ptr, diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/variance_sse2.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/vp8_variance_sse2.c index 1fe127bf2c6..1c15ed88097 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/variance_sse2.c +++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/vp8_variance_sse2.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ +#include "./vp8_rtcd.h" #include "vpx_config.h" #include "vp8/common/variance.h" #include "vpx_ports/mem.h" @@ -30,38 +31,6 @@ extern void vp8_filter_block2d_bil4x4_var_mmx unsigned int *sumsquared ); -extern unsigned int vp8_get4x4var_mmx -( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *SSE, - int *Sum -); - -unsigned int vp8_get_mb_ss_sse2 -( - const short *src_ptr -); -unsigned int vp8_get16x16var_sse2 -( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *SSE, - int *Sum -); -unsigned int vp8_get8x8var_sse2 -( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *SSE, - int *Sum -); void vp8_filter_block2d_bil_var_sse2 ( const unsigned char *ref_ptr, @@ -135,115 +104,6 @@ void vp8_half_vert_variance16x_h_sse2 unsigned int *sumsquared ); -unsigned int vp8_variance4x4_wmt( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - unsigned int var; - int avg; - - vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ; - *sse = var; - return (var - (((unsigned int)avg * avg) >> 4)); - -} - -unsigned int vp8_variance8x8_wmt -( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - unsigned int var; - int avg; - - vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ; - *sse = var; - return (var - (((unsigned int)avg * avg) >> 6)); - -} - - -unsigned int vp8_variance16x16_wmt -( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - unsigned int sse0; - int sum0; - - - vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; - *sse = sse0; - return (sse0 - (((unsigned int)sum0 * sum0) >> 8)); -} -unsigned int vp8_mse16x16_wmt( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - - unsigned int sse0; - int sum0; - vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; - *sse = sse0; - return sse0; - -} - - -unsigned int vp8_variance16x8_wmt -( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - unsigned int sse0, sse1, var; - int sum0, sum1, avg; - - vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; - vp8_get8x8var_sse2(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1); - - var = sse0 + sse1; - avg = sum0 + sum1; - *sse = var; - return (var - (((unsigned int)avg * avg) >> 7)); - -} - -unsigned int vp8_variance8x16_wmt -( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - unsigned int sse0, sse1, var; - int sum0, sum1, avg; - - vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; - vp8_get8x8var_sse2(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1) ; - - var = sse0 + sse1; - avg = sum0 + sum1; - *sse = var; - return (var - (((unsigned int)avg * avg) >> 7)); - -} - unsigned int vp8_sub_pixel_variance4x4_wmt ( const unsigned char *src_ptr, @@ -378,20 +238,6 @@ unsigned int 
vp8_sub_pixel_variance16x16_wmt return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8)); } -unsigned int vp8_sub_pixel_mse16x16_wmt( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) -{ - vp8_sub_pixel_variance16x16_wmt(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse); - return *sse; -} - unsigned int vp8_sub_pixel_variance16x8_wmt ( const unsigned char *src_ptr, diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/decoder/onyxd_if.c b/chromium/third_party/libvpx/source/libvpx/vp8/decoder/onyxd_if.c index d7b8c76dc26..9015fcbb496 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/decoder/onyxd_if.c +++ b/chromium/third_party/libvpx/source/libvpx/vp8/decoder/onyxd_if.c @@ -259,7 +259,7 @@ static int swap_frame_buffers (VP8_COMMON *cm) return err; } -int check_fragments_for_errors(VP8D_COMP *pbi) +static int check_fragments_for_errors(VP8D_COMP *pbi) { if (!pbi->ec_active && pbi->fragments.count <= 1 && pbi->fragments.sizes[0] == 0) diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm deleted file mode 100644 index 000805d4fed..00000000000 --- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm +++ /dev/null @@ -1,138 +0,0 @@ -; -; Copyright (c) 2011 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_mse16x16_armv6| - - ARM - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 unsigned char *src_ptr -; r1 int source_stride -; r2 unsigned char *ref_ptr -; r3 int recon_stride -; stack unsigned int *sse -; -;note: Based on vp8_variance16x16_armv6. In this function, sum is never used. -; So, we can remove this part of calculation. 
- -|vp8_mse16x16_armv6| PROC - - push {r4-r9, lr} - - pld [r0, r1, lsl #0] - pld [r2, r3, lsl #0] - - mov r12, #16 ; set loop counter to 16 (=block height) - mov r4, #0 ; initialize sse = 0 - -loop - ; 1st 4 pixels - ldr r5, [r0, #0x0] ; load 4 src pixels - ldr r6, [r2, #0x0] ; load 4 ref pixels - - mov lr, #0 ; constant zero - - usub8 r8, r5, r6 ; calculate difference - pld [r0, r1, lsl #1] - sel r7, r8, lr ; select bytes with positive difference - usub8 r9, r6, r5 ; calculate difference with reversed operands - pld [r2, r3, lsl #1] - sel r8, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r5, r7, lr ; calculate sum of positive differences - usad8 r6, r8, lr ; calculate sum of negative differences - orr r8, r8, r7 ; differences of all 4 pixels - - ldr r5, [r0, #0x4] ; load 4 src pixels - - ; calculate sse - uxtb16 r6, r8 ; byte (two pixels) to halfwords - uxtb16 r7, r8, ror #8 ; another two pixels to halfwords - smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1) - - ; 2nd 4 pixels - ldr r6, [r2, #0x4] ; load 4 ref pixels - smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2) - - usub8 r8, r5, r6 ; calculate difference - sel r7, r8, lr ; select bytes with positive difference - usub8 r9, r6, r5 ; calculate difference with reversed operands - sel r8, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r5, r7, lr ; calculate sum of positive differences - usad8 r6, r8, lr ; calculate sum of negative differences - orr r8, r8, r7 ; differences of all 4 pixels - ldr r5, [r0, #0x8] ; load 4 src pixels - ; calculate sse - uxtb16 r6, r8 ; byte (two pixels) to halfwords - uxtb16 r7, r8, ror #8 ; another two pixels to halfwords - smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1) - - ; 3rd 4 pixels - ldr r6, [r2, #0x8] ; load 4 ref pixels - smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2) - - usub8 r8, r5, r6 ; calculate difference - sel r7, r8, lr ; select bytes with positive difference - usub8 r9, r6, r5 ; calculate difference with reversed operands - sel r8, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r5, r7, lr ; calculate sum of positive differences - usad8 r6, r8, lr ; calculate sum of negative differences - orr r8, r8, r7 ; differences of all 4 pixels - - ldr r5, [r0, #0xc] ; load 4 src pixels - - ; calculate sse - uxtb16 r6, r8 ; byte (two pixels) to halfwords - uxtb16 r7, r8, ror #8 ; another two pixels to halfwords - smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1) - - ; 4th 4 pixels - ldr r6, [r2, #0xc] ; load 4 ref pixels - smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2) - - usub8 r8, r5, r6 ; calculate difference - add r0, r0, r1 ; set src_ptr to next row - sel r7, r8, lr ; select bytes with positive difference - usub8 r9, r6, r5 ; calculate difference with reversed operands - add r2, r2, r3 ; set dst_ptr to next row - sel r8, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r5, r7, lr ; calculate sum of positive differences - usad8 r6, r8, lr ; calculate sum of negative differences - orr r8, r8, r7 ; differences of all 4 pixels - - subs r12, r12, #1 ; next row - - ; calculate sse - uxtb16 r6, r8 ; byte (two pixels) to halfwords - uxtb16 r7, r8, ror #8 ; another two pixels to halfwords - smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1) - smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2) - - bne loop - - ; 
return stuff - ldr r1, [sp, #28] ; get address of sse - mov r0, r4 ; return sse - str r4, [r1] ; store sse - - pop {r4-r9, pc} - - ENDP - - END diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/vp8_mse16x16_neon.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/vp8_mse16x16_neon.c deleted file mode 100644 index f806809df5b..00000000000 --- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/vp8_mse16x16_neon.c +++ /dev/null @@ -1,131 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include <arm_neon.h> - -unsigned int vp8_mse16x16_neon( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - int i; - int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; - int64x1_t d0s64; - uint8x16_t q0u8, q1u8, q2u8, q3u8; - int32x4_t q7s32, q8s32, q9s32, q10s32; - uint16x8_t q11u16, q12u16, q13u16, q14u16; - int64x2_t q1s64; - - q7s32 = vdupq_n_s32(0); - q8s32 = vdupq_n_s32(0); - q9s32 = vdupq_n_s32(0); - q10s32 = vdupq_n_s32(0); - - for (i = 0; i < 8; i++) { // mse16x16_neon_loop - q0u8 = vld1q_u8(src_ptr); - src_ptr += source_stride; - q1u8 = vld1q_u8(src_ptr); - src_ptr += source_stride; - q2u8 = vld1q_u8(ref_ptr); - ref_ptr += recon_stride; - q3u8 = vld1q_u8(ref_ptr); - ref_ptr += recon_stride; - - q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8)); - q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8)); - q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8)); - q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8)); - - d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); - d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); - q7s32 = vmlal_s16(q7s32, d22s16, d22s16); - q8s32 = vmlal_s16(q8s32, d23s16, d23s16); - - d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); - d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); - q9s32 = vmlal_s16(q9s32, d24s16, d24s16); - q10s32 = vmlal_s16(q10s32, d25s16, d25s16); - - d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); - d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); - q7s32 = vmlal_s16(q7s32, d26s16, d26s16); - q8s32 = vmlal_s16(q8s32, d27s16, d27s16); - - d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16)); - d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16)); - q9s32 = vmlal_s16(q9s32, d28s16, d28s16); - q10s32 = vmlal_s16(q10s32, d29s16, d29s16); - } - - q7s32 = vaddq_s32(q7s32, q8s32); - q9s32 = vaddq_s32(q9s32, q10s32); - q10s32 = vaddq_s32(q7s32, q9s32); - - q1s64 = vpaddlq_s32(q10s32); - d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); - - vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d0s64), 0); - return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0); -} - -unsigned int vp8_get4x4sse_cs_neon( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride) { - int16x4_t d22s16, d24s16, d26s16, d28s16; - int64x1_t d0s64; - uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; - int32x4_t q7s32, q8s32, q9s32, q10s32; - uint16x8_t q11u16, q12u16, q13u16, q14u16; - int64x2_t q1s64; - - d0u8 = vld1_u8(src_ptr); - src_ptr += source_stride; - d4u8 
= vld1_u8(ref_ptr); - ref_ptr += recon_stride; - d1u8 = vld1_u8(src_ptr); - src_ptr += source_stride; - d5u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; - d2u8 = vld1_u8(src_ptr); - src_ptr += source_stride; - d6u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; - d3u8 = vld1_u8(src_ptr); - src_ptr += source_stride; - d7u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; - - q11u16 = vsubl_u8(d0u8, d4u8); - q12u16 = vsubl_u8(d1u8, d5u8); - q13u16 = vsubl_u8(d2u8, d6u8); - q14u16 = vsubl_u8(d3u8, d7u8); - - d22s16 = vget_low_s16(vreinterpretq_s16_u16(q11u16)); - d24s16 = vget_low_s16(vreinterpretq_s16_u16(q12u16)); - d26s16 = vget_low_s16(vreinterpretq_s16_u16(q13u16)); - d28s16 = vget_low_s16(vreinterpretq_s16_u16(q14u16)); - - q7s32 = vmull_s16(d22s16, d22s16); - q8s32 = vmull_s16(d24s16, d24s16); - q9s32 = vmull_s16(d26s16, d26s16); - q10s32 = vmull_s16(d28s16, d28s16); - - q7s32 = vaddq_s32(q7s32, q8s32); - q9s32 = vaddq_s32(q9s32, q10s32); - q9s32 = vaddq_s32(q7s32, q9s32); - - q1s64 = vpaddlq_s32(q9s32); - d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); - - return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0); -} diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/dct.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/dct.c index 091554a5d50..0c7198d5d3a 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/dct.c +++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/dct.c @@ -11,6 +11,8 @@ #include <math.h> +#include "./vp8_rtcd.h" + void vp8_short_fdct4x4_c(short *input, short *output, int pitch) { int i; diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/encodeframe.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/encodeframe.c index 378e902c6a4..d381d8ddf45 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/encodeframe.c +++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/encodeframe.c @@ -11,6 +11,7 @@ #include "vpx_config.h" #include "vp8_rtcd.h" +#include "./vpx_dsp_rtcd.h" #include "encodemb.h" #include "encodemv.h" #include "vp8/common/common.h" @@ -90,7 +91,7 @@ static unsigned int tt_activity_measure( VP8_COMP *cpi, MACROBLOCK *x ) * lambda using a non-linear combination (e.g., the smallest, or second * smallest, etc.). 
*/ - act = vp8_variance16x16(x->src.y_buffer, + act = vpx_variance16x16(x->src.y_buffer, x->src.y_stride, VP8_VAR_OFFS, 0, &sse); act = act<<4; diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/encodeintra.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/encodeintra.c index cfa4cb927f6..e2de5eecbc4 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/encodeintra.c +++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/encodeintra.c @@ -11,6 +11,7 @@ #include "vpx_config.h" #include "vp8_rtcd.h" +#include "./vpx_dsp_rtcd.h" #include "quantize.h" #include "vp8/common/reconintra4x4.h" #include "encodemb.h" @@ -44,7 +45,7 @@ int vp8_encode_intra(VP8_COMP *cpi, MACROBLOCK *x, int use_dc_pred) } } - intra_pred_var = vp8_get_mb_ss(x->src_diff); + intra_pred_var = vpx_get_mb_ss(x->src_diff); return intra_pred_var; } diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ethreading.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ethreading.c index 977b0b0321e..4e234ccd58b 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ethreading.c +++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ethreading.c @@ -19,8 +19,6 @@ extern void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x, int ok_to_skip); -extern void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm); - static THREAD_FUNCTION thread_loopfilter(void *p_data) { VP8_COMP *cpi = (VP8_COMP *)(((LPFTHREAD_DATA *)p_data)->ptr1); diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/firstpass.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/firstpass.c index 75c1362610f..3deb4abb337 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/firstpass.c +++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/firstpass.c @@ -12,6 +12,7 @@ #include <limits.h> #include <stdio.h> +#include "./vpx_dsp_rtcd.h" #include "./vpx_scale_rtcd.h" #include "block.h" #include "onyx_int.h" @@ -34,8 +35,6 @@ /* #define OUTPUT_FPF 1 */ extern void vp8cx_frame_init_quantizer(VP8_COMP *cpi); -extern void vp8_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb, int_mv *mv); -extern void vp8_alloc_compressor_data(VP8_COMP *cpi); #define GFQ_ADJUSTMENT vp8_gf_boost_qadjustment[Q] extern int vp8_kf_boost_qadjustment[QINDEX_RANGE]; @@ -424,14 +423,14 @@ static void zz_motion_search( VP8_COMP *cpi, MACROBLOCK * x, /* Set up pointers for this macro block raw buffer */ raw_ptr = (unsigned char *)(raw_buffer->y_buffer + recon_yoffset + d->offset); - vp8_mse16x16 ( src_ptr, src_stride, raw_ptr, raw_stride, - (unsigned int *)(raw_motion_err)); + vpx_mse16x16(src_ptr, src_stride, raw_ptr, raw_stride, + (unsigned int *)(raw_motion_err)); /* Set up pointers for this macro block recon buffer */ xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset; ref_ptr = (unsigned char *)(xd->pre.y_buffer + d->offset ); - vp8_mse16x16 ( src_ptr, src_stride, ref_ptr, ref_stride, - (unsigned int *)(best_motion_err)); + vpx_mse16x16(src_ptr, src_stride, ref_ptr, ref_stride, + (unsigned int *)(best_motion_err)); } static void first_pass_motion_search(VP8_COMP *cpi, MACROBLOCK *x, @@ -455,7 +454,7 @@ static void first_pass_motion_search(VP8_COMP *cpi, MACROBLOCK *x, int new_mv_mode_penalty = 256; /* override the default variance function to use MSE */ - v_fn_ptr.vf = vp8_mse16x16; + v_fn_ptr.vf = vpx_mse16x16; /* Set up pointers for this macro block recon buffer */ xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset; @@ -1329,8 +1328,6 @@ static int 
estimate_kf_group_q(VP8_COMP *cpi, double section_err, int section_ta return Q; } -extern void vp8_new_framerate(VP8_COMP *cpi, double framerate); - void vp8_init_second_pass(VP8_COMP *cpi) { FIRSTPASS_STATS this_frame; diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/modecosts.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/modecosts.c index c61563c56f7..ad0e9308dc1 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/modecosts.c +++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/modecosts.c @@ -10,6 +10,7 @@ #include "vp8/common/blockd.h" +#include "modecosts.h" #include "onyx_int.h" #include "treewriter.h" #include "vp8/common/entropymode.h" diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/modecosts.h b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/modecosts.h index 9281551c8d5..9871bfffdf9 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/modecosts.h +++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/modecosts.h @@ -16,7 +16,9 @@ extern "C" { #endif -void vp8_init_mode_costs(VP8_COMP *x); +struct VP8_COMP; + +void vp8_init_mode_costs(struct VP8_COMP *x); #ifdef __cplusplus } // extern "C" diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_if.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_if.c index 5b452312ed2..40e29e191af 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_if.c +++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_if.c @@ -587,7 +587,8 @@ static void cyclic_background_refresh(VP8_COMP *cpi, int Q, int lf_adjustment) // Turn-off under certain conditions (i.e., away from key frame, and if // we are at good quality (low Q) and most of the blocks were skipped-encoded // in previous frame. - if (Q >= 100) { + int qp_thresh = (cpi->oxcf.screen_content_mode == 2) ? 
80 : 100; + if (Q >= qp_thresh) { cpi->cyclic_refresh_mode_max_mbs_perframe = (cpi->common.mb_rows * cpi->common.mb_cols) / 10; } else if (cpi->frames_since_key > 250 && @@ -2011,6 +2012,8 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf) cpi->source_alt_ref_active = 0; cpi->common.refresh_alt_ref_frame = 0; + cpi->force_maxqp = 0; + cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS; #if CONFIG_INTERNAL_STATS cpi->b_calculate_ssimg = 0; @@ -2128,7 +2131,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf) #endif cpi->fn_ptr[BLOCK_16X16].sdf = vpx_sad16x16; - cpi->fn_ptr[BLOCK_16X16].vf = vp8_variance16x16; + cpi->fn_ptr[BLOCK_16X16].vf = vpx_variance16x16; cpi->fn_ptr[BLOCK_16X16].svf = vp8_sub_pixel_variance16x16; cpi->fn_ptr[BLOCK_16X16].svf_halfpix_h = vp8_variance_halfpixvar16x16_h; cpi->fn_ptr[BLOCK_16X16].svf_halfpix_v = vp8_variance_halfpixvar16x16_v; @@ -2138,7 +2141,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf) cpi->fn_ptr[BLOCK_16X16].sdx4df = vpx_sad16x16x4d; cpi->fn_ptr[BLOCK_16X8].sdf = vpx_sad16x8; - cpi->fn_ptr[BLOCK_16X8].vf = vp8_variance16x8; + cpi->fn_ptr[BLOCK_16X8].vf = vpx_variance16x8; cpi->fn_ptr[BLOCK_16X8].svf = vp8_sub_pixel_variance16x8; cpi->fn_ptr[BLOCK_16X8].svf_halfpix_h = NULL; cpi->fn_ptr[BLOCK_16X8].svf_halfpix_v = NULL; @@ -2148,7 +2151,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf) cpi->fn_ptr[BLOCK_16X8].sdx4df = vpx_sad16x8x4d; cpi->fn_ptr[BLOCK_8X16].sdf = vpx_sad8x16; - cpi->fn_ptr[BLOCK_8X16].vf = vp8_variance8x16; + cpi->fn_ptr[BLOCK_8X16].vf = vpx_variance8x16; cpi->fn_ptr[BLOCK_8X16].svf = vp8_sub_pixel_variance8x16; cpi->fn_ptr[BLOCK_8X16].svf_halfpix_h = NULL; cpi->fn_ptr[BLOCK_8X16].svf_halfpix_v = NULL; @@ -2158,7 +2161,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf) cpi->fn_ptr[BLOCK_8X16].sdx4df = vpx_sad8x16x4d; cpi->fn_ptr[BLOCK_8X8].sdf = vpx_sad8x8; - cpi->fn_ptr[BLOCK_8X8].vf = vp8_variance8x8; + cpi->fn_ptr[BLOCK_8X8].vf = vpx_variance8x8; cpi->fn_ptr[BLOCK_8X8].svf = vp8_sub_pixel_variance8x8; cpi->fn_ptr[BLOCK_8X8].svf_halfpix_h = NULL; cpi->fn_ptr[BLOCK_8X8].svf_halfpix_v = NULL; @@ -2168,7 +2171,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf) cpi->fn_ptr[BLOCK_8X8].sdx4df = vpx_sad8x8x4d; cpi->fn_ptr[BLOCK_4X4].sdf = vpx_sad4x4; - cpi->fn_ptr[BLOCK_4X4].vf = vp8_variance4x4; + cpi->fn_ptr[BLOCK_4X4].vf = vpx_variance4x4; cpi->fn_ptr[BLOCK_4X4].svf = vp8_sub_pixel_variance4x4; cpi->fn_ptr[BLOCK_4X4].svf_halfpix_h = NULL; cpi->fn_ptr[BLOCK_4X4].svf_halfpix_v = NULL; @@ -2555,7 +2558,7 @@ static uint64_t calc_plane_error(unsigned char *orig, int orig_stride, { unsigned int sse; - vp8_mse16x16(orig + col, orig_stride, + vpx_mse16x16(orig + col, orig_stride, recon + col, recon_stride, &sse); total_sse += sse; @@ -3381,7 +3384,7 @@ static int measure_square_diff_partial(YV12_BUFFER_CONFIG *source, int index = block_index_row + (j >> 4); if (cpi->consec_zero_last[index] >= min_consec_zero_last) { unsigned int sse; - Total += vp8_mse16x16(src + j, + Total += vpx_mse16x16(src + j, source->y_stride, dst + j, dest->y_stride, &sse); @@ -3445,7 +3448,7 @@ static void process_denoiser_mode_change(VP8_COMP *cpi) { int index = block_index_row + (j >> 4); if (cpi->consec_zero_last[index] >= min_consec_zero_last) { unsigned int sse; - const unsigned int var = vp8_variance16x16(src + j, + const unsigned int var = vpx_variance16x16(src + j, ystride, dst + j, ystride, @@ -3455,7 +3458,7 @@ static void process_denoiser_mode_change(VP8_COMP *cpi) { // is small (to avoid effects 
from lighting change). if ((sse - var) < 128) { unsigned int sse2; - const unsigned int act = vp8_variance16x16(src + j, + const unsigned int act = vpx_variance16x16(src + j, ystride, const_source, 0, @@ -4184,7 +4187,10 @@ static void encode_frame_to_data_rate */ if (cpi->cyclic_refresh_mode_enabled) { - if (cpi->current_layer==0) + // Special case for screen_content_mode with golden frame updates. + int disable_cr_gf = (cpi->oxcf.screen_content_mode == 2 && + cm->refresh_golden_frame); + if (cpi->current_layer == 0 && cpi->force_maxqp == 0 && !disable_cr_gf) cyclic_background_refresh(cpi, Q, 0); else disable_segmentation(cpi); @@ -4406,6 +4412,11 @@ static void encode_frame_to_data_rate /* transform / motion compensation build reconstruction frame */ vp8_encode_frame(cpi); + if (cpi->oxcf.screen_content_mode == 2) { + if (vp8_drop_encodedframe_overshoot(cpi, Q)) + return; + } + cpi->projected_frame_size -= vp8_estimate_entropy_savings(cpi); cpi->projected_frame_size = (cpi->projected_frame_size > 0) ? cpi->projected_frame_size : 0; #endif @@ -5982,7 +5993,8 @@ int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest) for (j = 0; j < source->y_width; j += 16) { unsigned int sse; - Total += vp8_mse16x16(src + j, source->y_stride, dst + j, dest->y_stride, &sse); + Total += vpx_mse16x16(src + j, source->y_stride, + dst + j, dest->y_stride, &sse); } src += 16 * source->y_stride; diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_int.h b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_int.h index 82d7453902c..c48e2f4478b 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_int.h +++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_int.h @@ -526,6 +526,8 @@ typedef struct VP8_COMP // Measure of average squared difference between source and denoised signal. int mse_source_denoised; + int force_maxqp; + #if CONFIG_MULTITHREAD /* multithread data */ int * mt_current_mb_col; @@ -714,6 +716,11 @@ typedef struct VP8_COMP } rd_costs; } VP8_COMP; +void vp8_alloc_compressor_data(VP8_COMP *cpi); +int vp8_reverse_trans(int x); +void vp8_new_framerate(VP8_COMP *cpi, double framerate); +void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm); + void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned char *dest_end, unsigned long *size); diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/pickinter.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/pickinter.c index c4c0e7e9e23..053bf119aa9 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/pickinter.c +++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/pickinter.c @@ -11,6 +11,7 @@ #include <limits.h> #include "vpx_config.h" +#include "./vpx_dsp_rtcd.h" #include "onyx_int.h" #include "modecosts.h" #include "encodeintra.h" @@ -29,8 +30,6 @@ #include "denoising.h" #endif -extern int VP8_UVSSE(MACROBLOCK *x); - #ifdef SPEEDSTATS extern unsigned int cnt_pm; #endif @@ -38,8 +37,6 @@ extern unsigned int cnt_pm; extern const int vp8_ref_frame_order[MAX_MODES]; extern const MB_PREDICTION_MODE vp8_mode_order[MAX_MODES]; -extern int vp8_cost_mv_ref(MB_PREDICTION_MODE m, const int near_mv_ref_ct[4]); - // Fixed point implementation of a skin color classifier. Skin color // is model by a Gaussian distribution in the CbCr color space. 
// See ../../test/skin_color_detector_test.cc where the reference @@ -219,33 +216,6 @@ int vp8_get_inter_mbpred_error(MACROBLOCK *mb, } - -unsigned int vp8_get4x4sse_cs_c -( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride -) -{ - int distortion = 0; - int r, c; - - for (r = 0; r < 4; r++) - { - for (c = 0; c < 4; c++) - { - int diff = src_ptr[c] - ref_ptr[c]; - distortion += diff * diff; - } - - src_ptr += source_stride; - ref_ptr += recon_stride; - } - - return distortion; -} - static int get_prediction_error(BLOCK *be, BLOCKD *b) { unsigned char *sptr; @@ -253,7 +223,7 @@ static int get_prediction_error(BLOCK *be, BLOCKD *b) sptr = (*(be->base_src) + be->src); dptr = b->predictor; - return vp8_get4x4sse_cs(sptr, be->src_stride, dptr, 16); + return vpx_get4x4sse_cs(sptr, be->src_stride, dptr, 16); } @@ -1041,7 +1011,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, else { rate2 += rate; - distortion2 = vp8_variance16x16( + distortion2 = vpx_variance16x16( *(b->base_src), b->src_stride, x->e_mbd.predictor, 16, &sse); this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2); @@ -1070,7 +1040,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, xd->dst.y_stride, xd->predictor, 16); - distortion2 = vp8_variance16x16 + distortion2 = vpx_variance16x16 (*(b->base_src), b->src_stride, x->e_mbd.predictor, 16, &sse); rate2 += x->mbmode_cost[x->e_mbd.frame_type][x->e_mbd.mode_info_context->mbmi.mode]; @@ -1551,7 +1521,7 @@ void vp8_pick_intra_mode(MACROBLOCK *x, int *rate_) xd->dst.y_stride, xd->predictor, 16); - distortion = vp8_variance16x16 + distortion = vpx_variance16x16 (*(b->base_src), b->src_stride, xd->predictor, 16, &sse); rate = x->mbmode_cost[xd->frame_type][mode]; this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion); diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/picklpf.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/picklpf.c index 890053dcfdc..875b37f6841 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/picklpf.c +++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/picklpf.c @@ -9,6 +9,7 @@ */ +#include "./vpx_dsp_rtcd.h" #include "./vpx_scale_rtcd.h" #include "vp8/common/onyxc_int.h" #include "onyx_int.h" @@ -83,7 +84,7 @@ static int calc_partial_ssl_err(YV12_BUFFER_CONFIG *source, for (j = 0; j < source->y_width; j += 16) { unsigned int sse; - Total += vp8_mse16x16(src + j, source->y_stride, + Total += vpx_mse16x16(src + j, source->y_stride, dst + j, dest->y_stride, &sse); } diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ratectrl.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ratectrl.c index 25d7a4998cb..e8796a1fcfb 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ratectrl.c +++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ratectrl.c @@ -1215,6 +1215,11 @@ int vp8_regulate_q(VP8_COMP *cpi, int target_bits_per_frame) { int Q = cpi->active_worst_quality; + if (cpi->force_maxqp == 1) { + cpi->active_worst_quality = cpi->worst_quality; + return cpi->worst_quality; + } + /* Reset Zbin OQ value */ cpi->mb.zbin_over_quant = 0; @@ -1559,3 +1564,46 @@ int vp8_pick_frame_size(VP8_COMP *cpi) } return 1; } +// If this just encoded frame (mcomp/transform/quant, but before loopfilter and +// pack_bitstream) has large overshoot, and was not being encoded close to the +// max QP, then drop this frame and force next frame to be encoded at max QP. 
+// Condition this on 1 pass CBR with screen content mode and frame dropper off. +// TODO(marpan): Should do this exit condition during the encode_frame +// (i.e., halfway during the encoding of the frame) to save cycles. +int vp8_drop_encodedframe_overshoot(VP8_COMP *cpi, int Q) { + if (cpi->pass == 0 && + cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER && + cpi->drop_frames_allowed == 0 && + cpi->common.frame_type != KEY_FRAME) { + // Note: the "projected_frame_size" from encode_frame() only gives estimate + // of mode/motion vector rate (in non-rd mode): so below we only require + // that projected_frame_size is somewhat greater than per-frame-bandwidth, + // but add additional condition with high threshold on prediction residual. + + // QP threshold: only allow dropping if we are not close to qp_max. + int thresh_qp = 3 * cpi->worst_quality >> 2; + // Rate threshold, in bytes. + int thresh_rate = 2 * (cpi->av_per_frame_bandwidth >> 3); + // Threshold for the average (over all macroblocks) of the pixel-sum + // residual error over 16x16 block. Should add QP dependence on threshold? + int thresh_pred_err_mb = (256 << 4); + int pred_err_mb = (int)(cpi->mb.prediction_error / cpi->common.MBs); + if (Q < thresh_qp && + cpi->projected_frame_size > thresh_rate && + pred_err_mb > thresh_pred_err_mb) { + // Drop this frame: advance frame counters, and set force_maxqp flag. + cpi->common.current_video_frame++; + cpi->frames_since_key++; + // Flag to indicate we will force next frame to be encoded at max QP. + cpi->force_maxqp = 1; + return 1; + } else { + cpi->force_maxqp = 0; + return 0; + } + cpi->force_maxqp = 0; + return 0; + } + cpi->force_maxqp = 0; + return 0; +} diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ratectrl.h b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ratectrl.h index 829697f391f..703de9ff550 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ratectrl.h +++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ratectrl.h @@ -30,6 +30,8 @@ extern void vp8_compute_frame_size_bounds(VP8_COMP *cpi, int *frame_under_shoot_ /* return of 0 means drop frame */ extern int vp8_pick_frame_size(VP8_COMP *cpi); +extern int vp8_drop_encodedframe_overshoot(VP8_COMP *cpi, int Q); + #ifdef __cplusplus } // extern "C" #endif diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/rdopt.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/rdopt.c index 9ccd85eb93f..17194f0d449 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/rdopt.c +++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/rdopt.c @@ -15,6 +15,7 @@ #include <assert.h> #include "vpx_config.h" #include "vp8_rtcd.h" +#include "./vpx_dsp_rtcd.h" #include "tokenize.h" #include "treewriter.h" #include "onyx_int.h" @@ -507,9 +508,9 @@ int VP8_UVSSE(MACROBLOCK *x) } else { - vp8_variance8x8(uptr, pre_stride, + vpx_variance8x8(uptr, pre_stride, upred_ptr, uv_stride, &sse2); - vp8_variance8x8(vptr, pre_stride, + vpx_variance8x8(vptr, pre_stride, vpred_ptr, uv_stride, &sse1); sse2 += sse1; } @@ -1783,7 +1784,7 @@ static int evaluate_inter_mode_rd(int mdcounts[4], if(threshold < x->encode_breakout) threshold = x->encode_breakout; - var = vp8_variance16x16 + var = vpx_variance16x16 (*(b->base_src), b->src_stride, x->e_mbd.predictor, 16, &sse); diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/rdopt.h b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/rdopt.h index e0da35e203c..b4fcd10b61e 100644 --- 
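A worked example of the thresholds in vp8_drop_encodedframe_overshoot above: with worst_quality at its maximum of 63, thresh_qp = (3 * 63) >> 2 = 47, so a drop is only considered when the frame was encoded at a Q below 47; thresh_rate = 2 * (av_per_frame_bandwidth >> 3) is roughly one quarter of the per-frame budget; and thresh_pred_err_mb = 256 << 4 = 4096, i.e. 16 per pixel when spread over the 256 pixels of a 16x16 macroblock. Only when all three conditions hold is the frame dropped and force_maxqp set, which the vp8_regulate_q change above then turns into encoding the next frame at worst_quality.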
a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/rdopt.h +++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/rdopt.h @@ -136,6 +136,9 @@ extern void vp8_mv_pred int near_sadidx[] ); void vp8_cal_sad(VP8_COMP *cpi, MACROBLOCKD *xd, MACROBLOCK *x, int recon_yoffset, int near_sadidx[]); +int VP8_UVSSE(MACROBLOCK *x); +int vp8_cost_mv_ref(MB_PREDICTION_MODE m, const int near_mv_ref_ct[4]); +void vp8_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb, int_mv *mv); #ifdef __cplusplus } // extern "C" diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/x86/quantize_ssse3.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/x86/quantize_ssse3.c index 448217ff412..14282db8016 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/x86/quantize_ssse3.c +++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/x86/quantize_ssse3.c @@ -17,7 +17,7 @@ #include <intrin.h> #pragma intrinsic(_BitScanReverse) static int bsr(int mask) { - int eob; + unsigned long eob; _BitScanReverse(&eob, mask); eob++; if (mask == 0) diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/vp8_common.mk b/chromium/third_party/libvpx/source/libvpx/vp8/vp8_common.mk index b4c814075c7..236d8a5d803 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/vp8_common.mk +++ b/chromium/third_party/libvpx/source/libvpx/vp8/vp8_common.mk @@ -86,8 +86,8 @@ VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/iwalsh_mmx.asm VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/loopfilter_mmx.asm VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/recon_mmx.asm VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/subpixel_mmx.asm -VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/variance_mmx.c -VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/variance_impl_mmx.asm +VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp8_variance_mmx.c +VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp8_variance_impl_mmx.asm VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/copy_sse2.asm VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idct_blk_sse2.c VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idctllm_sse2.asm @@ -96,7 +96,7 @@ VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_wrapper_sse2.c VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/subpixel_sse2.asm VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/loopfilter_sse2.asm VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/iwalsh_sse2.asm -VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/variance_sse2.c +VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp8_variance_sse2.c VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/variance_impl_sse2.asm VP8_COMMON_SRCS-$(HAVE_SSE3) += common/x86/copy_sse3.asm VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/subpixel_ssse3.asm @@ -145,8 +145,6 @@ VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/intra4x4_predict_v6$(ASM) VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/dequant_idct_v6$(ASM) VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/dequantize_v6$(ASM) VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/idct_blk_v6.c -VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance8x8_armv6$(ASM) -VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance16x16_armv6$(ASM) VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6$(ASM) VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6$(ASM) VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6$(ASM) @@ -168,7 +166,6 @@ VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/mbloopfilter_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/reconintra_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) 
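The vp8_ -> vpx_ renames in the hunks above (vpx_variance16x16, vpx_variance8x8, vpx_mse16x16) switch VP8 over to the shared vpx_dsp kernels; the deleted vp8_get4x4sse_cs_c shows the 4x4 case of the same computation. A minimal scalar sketch of the general pattern, assuming the usual vpx_dsp semantics (the _ref name is illustrative, not a function from the tree):

#include <stdint.h>

/* Sum of squared differences plus the difference sum, as computed by the
 * vpx_dsp variance/MSE helpers. vpx_mse16x16 reports sse directly;
 * vpx_variance16x16 reports sse - (sum * sum) / 256, i.e. SSE minus the
 * squared-mean term. */
static uint32_t sse_sum_ref(const uint8_t *src, int src_stride,
                            const uint8_t *ref, int ref_stride,
                            int w, int h, int *sum) {
  int r, c;
  uint32_t sse = 0;
  *sum = 0;
  for (r = 0; r < h; ++r) {
    for (c = 0; c < w; ++c) {
      const int diff = src[c] - ref[c];
      *sum += diff;
      sse += (uint32_t)(diff * diff);
    }
    src += src_stride;
    ref += ref_stride;
  }
  return sse;
}

Separately, the quantize_ssse3.c change above is a type fix: MSVC declares the intrinsic as unsigned char _BitScanReverse(unsigned long *Index, unsigned long Mask), so the index passed by address has to be an unsigned long rather than an int.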
+= common/arm/neon/shortidct4x4llm_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sixtappredict_neon.c -VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/variance_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp8_subpixelvariance_neon.c $(eval $(call rtcd_h_template,vp8_rtcd,vp8/common/rtcd_defs.pl)) diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/vp8_cx_iface.c b/chromium/third_party/libvpx/source/libvpx/vp8/vp8_cx_iface.c index af9cc7320b9..8697377892e 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/vp8_cx_iface.c +++ b/chromium/third_party/libvpx/source/libvpx/vp8/vp8_cx_iface.c @@ -135,7 +135,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, RANGE_CHECK(cfg, g_w, 1, 16383); /* 14 bits available */ RANGE_CHECK(cfg, g_h, 1, 16383); /* 14 bits available */ RANGE_CHECK(cfg, g_timebase.den, 1, 1000000000); - RANGE_CHECK(cfg, g_timebase.num, 1, cfg->g_timebase.den); + RANGE_CHECK(cfg, g_timebase.num, 1, 1000000000); RANGE_CHECK_HI(cfg, g_profile, 3); RANGE_CHECK_HI(cfg, rc_max_quantizer, 63); RANGE_CHECK_HI(cfg, rc_min_quantizer, cfg->rc_max_quantizer); @@ -199,7 +199,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, RANGE_CHECK_HI(vp8_cfg, arnr_strength, 6); RANGE_CHECK(vp8_cfg, arnr_type, 1, 3); RANGE_CHECK(vp8_cfg, cq_level, 0, 63); - RANGE_CHECK_BOOL(vp8_cfg, screen_content_mode); + RANGE_CHECK_HI(vp8_cfg, screen_content_mode, 2); if (finalize && (cfg->rc_end_usage == VPX_CQ || cfg->rc_end_usage == VPX_Q)) RANGE_CHECK(vp8_cfg, cq_level, cfg->rc_min_quantizer, cfg->rc_max_quantizer); @@ -478,8 +478,6 @@ static vpx_codec_err_t vp8e_set_config(vpx_codec_alg_priv_t *ctx, return res; } -int vp8_reverse_trans(int); - static vpx_codec_err_t get_quantizer(vpx_codec_alg_priv_t *ctx, va_list args) { int *const arg = va_arg(args, int *); diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/vp8cx_arm.mk b/chromium/third_party/libvpx/source/libvpx/vp8/vp8cx_arm.mk index 05003017982..0b0f6a70a12 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/vp8cx_arm.mk +++ b/chromium/third_party/libvpx/source/libvpx/vp8/vp8cx_arm.mk @@ -18,7 +18,6 @@ VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/dct_arm.c #File list for media # encoder VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/vp8_short_fdct4x4_armv6$(ASM) -VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/vp8_mse16x16_armv6$(ASM) VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/walsh_v6$(ASM) #File list for neon @@ -27,5 +26,4 @@ VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/denoising_neon.c VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/fastquantizeb_neon.c VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/shortfdct_neon.c VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/subtract_neon.c -VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp8_mse16x16_neon.c VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp8_shortwalsh4x4_neon.c diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct16x16_1_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct16x16_1_add_neon.c index 3c8c6a9348d..0233877dd38 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct16x16_1_add_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct16x16_1_add_neon.c @@ -9,6 +9,8 @@ */ #include <arm_neon.h> + +#include "vpx_ports/mem.h" #include "vp9/common/vp9_idct.h" void vp9_idct16x16_1_add_neon( diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct32x32_1_add_neon.c 
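The RANGE_CHECK_HI change in vp8_cx_iface.c above widens screen_content_mode from a boolean to the range 0-2, with 2 selecting the more aggressive screen-content behaviour added by this patch (overshoot-triggered frame drops and the cyclic-refresh/golden-frame special case). A minimal usage sketch, assuming the VP8E_SET_SCREEN_CONTENT_MODE control from vp8cx.h (not part of this hunk):

  /* after vpx_codec_enc_init(&codec, vpx_codec_vp8_cx(), &cfg, 0) */
  vpx_codec_control(&codec, VP8E_SET_SCREEN_CONTENT_MODE, 2);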
b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct32x32_1_add_neon.c index d0e4b4f4014..0ce45f2bfa8 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct32x32_1_add_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct32x32_1_add_neon.c @@ -9,10 +9,12 @@ */ #include <arm_neon.h> -#include "vp9/common/vp9_idct.h" #include "./vpx_config.h" +#include "vpx_ports/mem.h" +#include "vp9/common/vp9_idct.h" + static INLINE void LD_16x8( uint8_t *d, int d_stride, diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct4x4_1_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct4x4_1_add_neon.c index 7c8a930b645..f0457358e6c 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct4x4_1_add_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct4x4_1_add_neon.c @@ -9,6 +9,8 @@ */ #include <arm_neon.h> + +#include "vpx_ports/mem.h" #include "vp9/common/vp9_idct.h" void vp9_idct4x4_1_add_neon( diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct8x8_1_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct8x8_1_add_neon.c index 24c29fb77f6..5369697c7d1 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct8x8_1_add_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct8x8_1_add_neon.c @@ -9,6 +9,8 @@ */ #include <arm_neon.h> + +#include "vpx_ports/mem.h" #include "vp9/common/vp9_idct.h" void vp9_idct8x8_1_add_neon( diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_reconintra_neon.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_reconintra_neon.c index d0beaa7208f..92706bf2c69 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_reconintra_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_reconintra_neon.c @@ -8,466 +8,815 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include <stddef.h> #include <arm_neon.h> -void vp9_v_predictor_4x4_neon( - uint8_t *dst, - ptrdiff_t y_stride, - const uint8_t *above, - const uint8_t *left) { +#include "./vp9_rtcd.h" +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" + +//------------------------------------------------------------------------------ +// DC 4x4 + +// 'do_above' and 'do_left' facilitate branch removal when inlined. 
+static INLINE void dc_4x4(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left, + int do_above, int do_left) { + uint16x8_t sum_top; + uint16x8_t sum_left; + uint8x8_t dc0; + + if (do_above) { + const uint8x8_t A = vld1_u8(above); // top row + const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top + const uint16x4_t p1 = vpadd_u16(p0, p0); + sum_top = vcombine_u16(p1, p1); + } + + if (do_left) { + const uint8x8_t L = vld1_u8(left); // left border + const uint16x4_t p0 = vpaddl_u8(L); // cascading summation of the left + const uint16x4_t p1 = vpadd_u16(p0, p0); + sum_left = vcombine_u16(p1, p1); + } + + if (do_above && do_left) { + const uint16x8_t sum = vaddq_u16(sum_left, sum_top); + dc0 = vrshrn_n_u16(sum, 3); + } else if (do_above) { + dc0 = vrshrn_n_u16(sum_top, 2); + } else if (do_left) { + dc0 = vrshrn_n_u16(sum_left, 2); + } else { + dc0 = vdup_n_u8(0x80); + } + + { + const uint8x8_t dc = vdup_lane_u8(dc0, 0); int i; - uint32x2_t d0u32 = vdup_n_u32(0); - (void)left; + for (i = 0; i < 4; ++i) { + vst1_lane_u32((uint32_t*)(dst + i * stride), vreinterpret_u32_u8(dc), 0); + } + } +} + +void vp9_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_4x4(dst, stride, above, left, 1, 1); +} + +void vp9_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + dc_4x4(dst, stride, NULL, left, 0, 1); +} + +void vp9_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + dc_4x4(dst, stride, above, NULL, 1, 0); +} - d0u32 = vld1_lane_u32((const uint32_t *)above, d0u32, 0); - for (i = 0; i < 4; i++, dst += y_stride) - vst1_lane_u32((uint32_t *)dst, d0u32, 0); - return; +void vp9_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + dc_4x4(dst, stride, NULL, NULL, 0, 0); } -void vp9_v_predictor_8x8_neon( - uint8_t *dst, - ptrdiff_t y_stride, - const uint8_t *above, - const uint8_t *left) { +//------------------------------------------------------------------------------ +// DC 8x8 + +// 'do_above' and 'do_left' facilitate branch removal when inlined. 
+static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left, + int do_above, int do_left) { + uint16x8_t sum_top; + uint16x8_t sum_left; + uint8x8_t dc0; + + if (do_above) { + const uint8x8_t A = vld1_u8(above); // top row + const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top + const uint16x4_t p1 = vpadd_u16(p0, p0); + const uint16x4_t p2 = vpadd_u16(p1, p1); + sum_top = vcombine_u16(p2, p2); + } + + if (do_left) { + const uint8x8_t L = vld1_u8(left); // left border + const uint16x4_t p0 = vpaddl_u8(L); // cascading summation of the left + const uint16x4_t p1 = vpadd_u16(p0, p0); + const uint16x4_t p2 = vpadd_u16(p1, p1); + sum_left = vcombine_u16(p2, p2); + } + + if (do_above && do_left) { + const uint16x8_t sum = vaddq_u16(sum_left, sum_top); + dc0 = vrshrn_n_u16(sum, 4); + } else if (do_above) { + dc0 = vrshrn_n_u16(sum_top, 3); + } else if (do_left) { + dc0 = vrshrn_n_u16(sum_left, 3); + } else { + dc0 = vdup_n_u8(0x80); + } + + { + const uint8x8_t dc = vdup_lane_u8(dc0, 0); int i; - uint8x8_t d0u8 = vdup_n_u8(0); - (void)left; + for (i = 0; i < 8; ++i) { + vst1_u32((uint32_t*)(dst + i * stride), vreinterpret_u32_u8(dc)); + } + } +} - d0u8 = vld1_u8(above); - for (i = 0; i < 8; i++, dst += y_stride) - vst1_u8(dst, d0u8); - return; +void vp9_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_8x8(dst, stride, above, left, 1, 1); } -void vp9_v_predictor_16x16_neon( - uint8_t *dst, - ptrdiff_t y_stride, - const uint8_t *above, - const uint8_t *left) { +void vp9_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + dc_8x8(dst, stride, NULL, left, 0, 1); +} + +void vp9_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + dc_8x8(dst, stride, above, NULL, 1, 0); +} + +void vp9_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + dc_8x8(dst, stride, NULL, NULL, 0, 0); +} + +//------------------------------------------------------------------------------ +// DC 16x16 + +// 'do_above' and 'do_left' facilitate branch removal when inlined. 
+static INLINE void dc_16x16(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left, + int do_above, int do_left) { + uint16x8_t sum_top; + uint16x8_t sum_left; + uint8x8_t dc0; + + if (do_above) { + const uint8x16_t A = vld1q_u8(above); // top row + const uint16x8_t p0 = vpaddlq_u8(A); // cascading summation of the top + const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); + const uint16x4_t p2 = vpadd_u16(p1, p1); + const uint16x4_t p3 = vpadd_u16(p2, p2); + sum_top = vcombine_u16(p3, p3); + } + + if (do_left) { + const uint8x16_t L = vld1q_u8(left); // left row + const uint16x8_t p0 = vpaddlq_u8(L); // cascading summation of the left + const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); + const uint16x4_t p2 = vpadd_u16(p1, p1); + const uint16x4_t p3 = vpadd_u16(p2, p2); + sum_left = vcombine_u16(p3, p3); + } + + if (do_above && do_left) { + const uint16x8_t sum = vaddq_u16(sum_left, sum_top); + dc0 = vrshrn_n_u16(sum, 5); + } else if (do_above) { + dc0 = vrshrn_n_u16(sum_top, 4); + } else if (do_left) { + dc0 = vrshrn_n_u16(sum_left, 4); + } else { + dc0 = vdup_n_u8(0x80); + } + + { + const uint8x16_t dc = vdupq_lane_u8(dc0, 0); int i; - uint8x16_t q0u8 = vdupq_n_u8(0); - (void)left; + for (i = 0; i < 16; ++i) { + vst1q_u8(dst + i * stride, dc); + } + } +} + +void vp9_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_16x16(dst, stride, above, left, 1, 1); +} + +void vp9_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + dc_16x16(dst, stride, NULL, left, 0, 1); +} - q0u8 = vld1q_u8(above); - for (i = 0; i < 16; i++, dst += y_stride) - vst1q_u8(dst, q0u8); - return; +void vp9_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)left; + dc_16x16(dst, stride, above, NULL, 1, 0); } -void vp9_v_predictor_32x32_neon( - uint8_t *dst, - ptrdiff_t y_stride, - const uint8_t *above, - const uint8_t *left) { +void vp9_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + dc_16x16(dst, stride, NULL, NULL, 0, 0); +} + +//------------------------------------------------------------------------------ +// DC 32x32 + +// 'do_above' and 'do_left' facilitate branch removal when inlined. 
+static INLINE void dc_32x32(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left, + int do_above, int do_left) { + uint16x8_t sum_top; + uint16x8_t sum_left; + uint8x8_t dc0; + + if (do_above) { + const uint8x16_t A0 = vld1q_u8(above); // top row + const uint8x16_t A1 = vld1q_u8(above + 16); + const uint16x8_t p0 = vpaddlq_u8(A0); // cascading summation of the top + const uint16x8_t p1 = vpaddlq_u8(A1); + const uint16x8_t p2 = vaddq_u16(p0, p1); + const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); + const uint16x4_t p4 = vpadd_u16(p3, p3); + const uint16x4_t p5 = vpadd_u16(p4, p4); + sum_top = vcombine_u16(p5, p5); + } + + if (do_left) { + const uint8x16_t L0 = vld1q_u8(left); // left row + const uint8x16_t L1 = vld1q_u8(left + 16); + const uint16x8_t p0 = vpaddlq_u8(L0); // cascading summation of the left + const uint16x8_t p1 = vpaddlq_u8(L1); + const uint16x8_t p2 = vaddq_u16(p0, p1); + const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); + const uint16x4_t p4 = vpadd_u16(p3, p3); + const uint16x4_t p5 = vpadd_u16(p4, p4); + sum_left = vcombine_u16(p5, p5); + } + + if (do_above && do_left) { + const uint16x8_t sum = vaddq_u16(sum_left, sum_top); + dc0 = vrshrn_n_u16(sum, 6); + } else if (do_above) { + dc0 = vrshrn_n_u16(sum_top, 5); + } else if (do_left) { + dc0 = vrshrn_n_u16(sum_left, 5); + } else { + dc0 = vdup_n_u8(0x80); + } + + { + const uint8x16_t dc = vdupq_lane_u8(dc0, 0); int i; - uint8x16_t q0u8 = vdupq_n_u8(0); - uint8x16_t q1u8 = vdupq_n_u8(0); - (void)left; - - q0u8 = vld1q_u8(above); - q1u8 = vld1q_u8(above + 16); - for (i = 0; i < 32; i++, dst += y_stride) { - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q1u8); + for (i = 0; i < 32; ++i) { + vst1q_u8(dst + i * stride, dc); + vst1q_u8(dst + i * stride + 16, dc); } - return; + } } -void vp9_h_predictor_4x4_neon( - uint8_t *dst, - ptrdiff_t y_stride, - const uint8_t *above, - const uint8_t *left) { - uint8x8_t d0u8 = vdup_n_u8(0); - uint32x2_t d1u32 = vdup_n_u32(0); - (void)above; +void vp9_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_32x32(dst, stride, above, left, 1, 1); +} - d1u32 = vld1_lane_u32((const uint32_t *)left, d1u32, 0); +void vp9_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + dc_32x32(dst, stride, NULL, left, 0, 1); +} - d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 0); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); - dst += y_stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 1); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); - dst += y_stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 2); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); - dst += y_stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 3); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); - return; +void vp9_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)left; + dc_32x32(dst, stride, above, NULL, 1, 0); } -void vp9_h_predictor_8x8_neon( - uint8_t *dst, - ptrdiff_t y_stride, - const uint8_t *above, - const uint8_t *left) { - uint8x8_t d0u8 = vdup_n_u8(0); - uint64x1_t d1u64 = vdup_n_u64(0); - (void)above; +void vp9_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + dc_32x32(dst, stride, NULL, NULL, 0, 0); +} - 
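The dc_4x4, dc_8x8, dc_16x16 and dc_32x32 helpers above all follow the same pattern: sum whichever of the above and left borders are available, take a rounded average (vrshrn_n_u16 by log2 of the number of summed pixels), and fall back to 0x80 when neither border is present. A minimal scalar sketch of that pattern (illustrative; the _ref name is not a function from the tree):

#include <stddef.h>
#include <stdint.h>

/* Scalar equivalent of the NEON dc_NxN helpers above. bs is 4, 8, 16 or 32;
 * have_above/have_left mirror the do_above/do_left parameters. */
static void dc_predictor_ref(uint8_t *dst, ptrdiff_t stride, int bs,
                             const uint8_t *above, const uint8_t *left,
                             int have_above, int have_left) {
  int i, r, c, sum = 0, count = 0, dc = 0x80;
  if (have_above) { for (i = 0; i < bs; ++i) sum += above[i]; count += bs; }
  if (have_left)  { for (i = 0; i < bs; ++i) sum += left[i];  count += bs; }
  if (count) dc = (sum + (count >> 1)) / count;  /* rounded average, as vrshrn */
  for (r = 0; r < bs; ++r, dst += stride)
    for (c = 0; c < bs; ++c) dst[c] = (uint8_t)dc;
}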
d1u64 = vld1_u64((const uint64_t *)left); +// ----------------------------------------------------------------------------- - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 0); - vst1_u8(dst, d0u8); - dst += y_stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 1); - vst1_u8(dst, d0u8); - dst += y_stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 2); - vst1_u8(dst, d0u8); - dst += y_stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 3); - vst1_u8(dst, d0u8); - dst += y_stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 4); - vst1_u8(dst, d0u8); - dst += y_stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 5); - vst1_u8(dst, d0u8); - dst += y_stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 6); - vst1_u8(dst, d0u8); - dst += y_stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 7); +void vp9_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(above)); // top row + const uint64x1_t A1 = vshr_n_u64(A0, 8); + const uint64x1_t A2 = vshr_n_u64(A0, 16); + const uint8x8_t ABCDEFGH = vreinterpret_u8_u64(A0); + const uint8x8_t BCDEFGH0 = vreinterpret_u8_u64(A1); + const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2); + const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGH00); + const uint8x8_t avg2 = vrhadd_u8(avg1, BCDEFGH0); + const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2); + const uint32x2_t r0 = vreinterpret_u32_u8(avg2); + const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8)); + const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16)); + const uint32x2_t r3 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24)); + (void)left; + vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0); + vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0); + vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0); + vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0); + dst[3 * stride + 3] = above[7]; +} + +void vp9_d45_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + static const uint8_t shuffle1[8] = { 1, 2, 3, 4, 5, 6, 7, 7 }; + static const uint8_t shuffle2[8] = { 2, 3, 4, 5, 6, 7, 7, 7 }; + const uint8x8_t sh_12345677 = vld1_u8(shuffle1); + const uint8x8_t sh_23456777 = vld1_u8(shuffle2); + const uint8x8_t A0 = vld1_u8(above); // top row + const uint8x8_t A1 = vtbl1_u8(A0, sh_12345677); + const uint8x8_t A2 = vtbl1_u8(A0, sh_23456777); + const uint8x8_t avg1 = vhadd_u8(A0, A2); + uint8x8_t row = vrhadd_u8(avg1, A1); + int i; + (void)left; + for (i = 0; i < 7; ++i) { + vst1_u8(dst + i * stride, row); + row = vtbl1_u8(row, sh_12345677); + } + vst1_u8(dst + i * stride, row); +} + +void vp9_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t A0 = vld1q_u8(above); // top row + const uint8x16_t above_right = vld1q_dup_u8(above + 15); + const uint8x16_t A1 = vextq_u8(A0, above_right, 1); + const uint8x16_t A2 = vextq_u8(A0, above_right, 2); + const uint8x16_t avg1 = vhaddq_u8(A0, A2); + uint8x16_t row = vrhaddq_u8(avg1, A1); + int i; + (void)left; + for (i = 0; i < 15; ++i) { + vst1q_u8(dst + i * stride, row); + row = vextq_u8(row, above_right, 1); + } + vst1q_u8(dst + i * stride, row); +} + +// ----------------------------------------------------------------------------- + +void vp9_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x8_t XABCD_u8 = vld1_u8(above - 
1); + const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8); + const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32); + const uint32x2_t zero = vdup_n_u32(0); + const uint32x2_t IJKL = vld1_lane_u32((const uint32_t *)left, zero, 0); + const uint8x8_t IJKL_u8 = vreinterpret_u8_u32(IJKL); + const uint64x1_t LKJI____ = vreinterpret_u64_u8(vrev32_u8(IJKL_u8)); + const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC); + const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8)); + const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16)); + const uint8_t D = vget_lane_u8(XABCD_u8, 4); + const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6); + const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC); + const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8); + const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_); + const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2); + const uint32x2_t r3 = vreinterpret_u32_u8(avg2); + const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8)); + const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16)); + const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24)); + vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0); + vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0); + vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0); + vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0); +} + +#if !HAVE_NEON_ASM + +void vp9_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int i; + uint32x2_t d0u32 = vdup_n_u32(0); + (void)left; + + d0u32 = vld1_lane_u32((const uint32_t *)above, d0u32, 0); + for (i = 0; i < 4; i++, dst += stride) + vst1_lane_u32((uint32_t *)dst, d0u32, 0); +} + +void vp9_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int i; + uint8x8_t d0u8 = vdup_n_u8(0); + (void)left; + + d0u8 = vld1_u8(above); + for (i = 0; i < 8; i++, dst += stride) vst1_u8(dst, d0u8); - return; } -void vp9_h_predictor_16x16_neon( - uint8_t *dst, - ptrdiff_t y_stride, - const uint8_t *above, - const uint8_t *left) { - int j; - uint8x8_t d2u8 = vdup_n_u8(0); - uint8x16_t q0u8 = vdupq_n_u8(0); - uint8x16_t q1u8 = vdupq_n_u8(0); - (void)above; +void vp9_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int i; + uint8x16_t q0u8 = vdupq_n_u8(0); + (void)left; + + q0u8 = vld1q_u8(above); + for (i = 0; i < 16; i++, dst += stride) + vst1q_u8(dst, q0u8); +} + +void vp9_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int i; + uint8x16_t q0u8 = vdupq_n_u8(0); + uint8x16_t q1u8 = vdupq_n_u8(0); + (void)left; + + q0u8 = vld1q_u8(above); + q1u8 = vld1q_u8(above + 16); + for (i = 0; i < 32; i++, dst += stride) { + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q1u8); + } +} + +void vp9_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x8_t d0u8 = vdup_n_u8(0); + uint32x2_t d1u32 = vdup_n_u32(0); + (void)above; + + d1u32 = vld1_lane_u32((const uint32_t *)left, d1u32, 0); + + d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 0); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); + dst += stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 1); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); + dst += stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 2); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); + dst += stride; + 
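The d45 kernels above vectorise the usual three-tap diagonal filter; written out in scalar form for the 4x4 case (the 8x8 and 16x16 versions apply the same filter, replicating the right-edge pixel past the end of the above row; the _ref name is illustrative):

/* Scalar form of the filter the d45 NEON kernels compute (4x4 case).
 * above[] holds 2 * 4 = 8 pixels; the bottom-right pixel copies above[7],
 * matching the explicit store in vp9_d45_predictor_4x4_neon above. */
static void d45_4x4_ref(uint8_t *dst, ptrdiff_t stride, const uint8_t *above) {
  int r, c;
  for (r = 0; r < 4; ++r)
    for (c = 0; c < 4; ++c)
      dst[r * stride + c] =
          (r + c + 2 < 8)
              ? (uint8_t)((above[r + c] + 2 * above[r + c + 1] +
                           above[r + c + 2] + 2) >> 2)
              : above[7];
}

In the NEON code the (a + 2b + c + 2) >> 2 rounding is obtained from a halving add of a and c (vhadd) followed by a rounding halving add with b (vrhadd), which gives the same result for 8-bit inputs.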
d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 3); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); +} + +void vp9_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x8_t d0u8 = vdup_n_u8(0); + uint64x1_t d1u64 = vdup_n_u64(0); + (void)above; + + d1u64 = vld1_u64((const uint64_t *)left); + + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 0); + vst1_u8(dst, d0u8); + dst += stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 1); + vst1_u8(dst, d0u8); + dst += stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 2); + vst1_u8(dst, d0u8); + dst += stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 3); + vst1_u8(dst, d0u8); + dst += stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 4); + vst1_u8(dst, d0u8); + dst += stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 5); + vst1_u8(dst, d0u8); + dst += stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 6); + vst1_u8(dst, d0u8); + dst += stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 7); + vst1_u8(dst, d0u8); +} + +void vp9_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int j; + uint8x8_t d2u8 = vdup_n_u8(0); + uint8x16_t q0u8 = vdupq_n_u8(0); + uint8x16_t q1u8 = vdupq_n_u8(0); + (void)above; + + q1u8 = vld1q_u8(left); + d2u8 = vget_low_u8(q1u8); + for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) { + q0u8 = vdupq_lane_u8(d2u8, 0); + vst1q_u8(dst, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 1); + vst1q_u8(dst, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 2); + vst1q_u8(dst, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 3); + vst1q_u8(dst, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 4); + vst1q_u8(dst, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 5); + vst1q_u8(dst, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 6); + vst1q_u8(dst, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 7); + vst1q_u8(dst, q0u8); + dst += stride; + } +} + +void vp9_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int j, k; + uint8x8_t d2u8 = vdup_n_u8(0); + uint8x16_t q0u8 = vdupq_n_u8(0); + uint8x16_t q1u8 = vdupq_n_u8(0); + (void)above; + for (k = 0; k < 2; k++, left += 16) { q1u8 = vld1q_u8(left); d2u8 = vget_low_u8(q1u8); for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) { - q0u8 = vdupq_lane_u8(d2u8, 0); - vst1q_u8(dst, q0u8); - dst += y_stride; - q0u8 = vdupq_lane_u8(d2u8, 1); - vst1q_u8(dst, q0u8); - dst += y_stride; - q0u8 = vdupq_lane_u8(d2u8, 2); - vst1q_u8(dst, q0u8); - dst += y_stride; - q0u8 = vdupq_lane_u8(d2u8, 3); - vst1q_u8(dst, q0u8); - dst += y_stride; - q0u8 = vdupq_lane_u8(d2u8, 4); - vst1q_u8(dst, q0u8); - dst += y_stride; - q0u8 = vdupq_lane_u8(d2u8, 5); - vst1q_u8(dst, q0u8); - dst += y_stride; - q0u8 = vdupq_lane_u8(d2u8, 6); - vst1q_u8(dst, q0u8); - dst += y_stride; - q0u8 = vdupq_lane_u8(d2u8, 7); - vst1q_u8(dst, q0u8); - dst += y_stride; - } - return; -} - -void vp9_h_predictor_32x32_neon( - uint8_t *dst, - ptrdiff_t y_stride, - const uint8_t *above, - const uint8_t *left) { - int j, k; - uint8x8_t d2u8 = vdup_n_u8(0); - uint8x16_t q0u8 = vdupq_n_u8(0); - uint8x16_t q1u8 = vdupq_n_u8(0); - (void)above; - - for (k = 0; k < 2; k++, left += 16) { - q1u8 = vld1q_u8(left); - d2u8 = vget_low_u8(q1u8); - for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) { - q0u8 = vdupq_lane_u8(d2u8, 0); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += y_stride; - q0u8 = 
vdupq_lane_u8(d2u8, 1); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += y_stride; - q0u8 = vdupq_lane_u8(d2u8, 2); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += y_stride; - q0u8 = vdupq_lane_u8(d2u8, 3); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += y_stride; - q0u8 = vdupq_lane_u8(d2u8, 4); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += y_stride; - q0u8 = vdupq_lane_u8(d2u8, 5); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += y_stride; - q0u8 = vdupq_lane_u8(d2u8, 6); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += y_stride; - q0u8 = vdupq_lane_u8(d2u8, 7); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += y_stride; - } + q0u8 = vdupq_lane_u8(d2u8, 0); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 1); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 2); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 3); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 4); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 5); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 6); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 7); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += stride; } - return; + } } -void vp9_tm_predictor_4x4_neon( - uint8_t *dst, - ptrdiff_t y_stride, - const uint8_t *above, - const uint8_t *left) { - int i; - uint16x8_t q1u16, q3u16; - int16x8_t q1s16; - uint8x8_t d0u8 = vdup_n_u8(0); - uint32x2_t d2u32 = vdup_n_u32(0); - - d0u8 = vdup_n_u8(above[-1]); - d2u32 = vld1_lane_u32((const uint32_t *)above, d2u32, 0); - q3u16 = vsubl_u8(vreinterpret_u8_u32(d2u32), d0u8); - for (i = 0; i < 4; i++, dst += y_stride) { - q1u16 = vdupq_n_u16((uint16_t)left[i]); - q1s16 = vaddq_s16(vreinterpretq_s16_u16(q1u16), - vreinterpretq_s16_u16(q3u16)); - d0u8 = vqmovun_s16(q1s16); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); - } - return; -} - -void vp9_tm_predictor_8x8_neon( - uint8_t *dst, - ptrdiff_t y_stride, - const uint8_t *above, - const uint8_t *left) { - int j; - uint16x8_t q0u16, q3u16, q10u16; - int16x8_t q0s16; - uint16x4_t d20u16; - uint8x8_t d0u8, d2u8, d30u8; - - d0u8 = vdup_n_u8(above[-1]); - d30u8 = vld1_u8(left); - d2u8 = vld1_u8(above); - q10u16 = vmovl_u8(d30u8); - q3u16 = vsubl_u8(d2u8, d0u8); +void vp9_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int i; + uint16x8_t q1u16, q3u16; + int16x8_t q1s16; + uint8x8_t d0u8 = vdup_n_u8(0); + uint32x2_t d2u32 = vdup_n_u32(0); + + d0u8 = vld1_dup_u8(above - 1); + d2u32 = vld1_lane_u32((const uint32_t *)above, d2u32, 0); + q3u16 = vsubl_u8(vreinterpret_u8_u32(d2u32), d0u8); + for (i = 0; i < 4; i++, dst += stride) { + q1u16 = vdupq_n_u16((uint16_t)left[i]); + q1s16 = vaddq_s16(vreinterpretq_s16_u16(q1u16), + vreinterpretq_s16_u16(q3u16)); + d0u8 = vqmovun_s16(q1s16); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); + } +} + +void vp9_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int j; + uint16x8_t q0u16, q3u16, q10u16; + int16x8_t q0s16; + uint16x4_t d20u16; + uint8x8_t d0u8, d2u8, d30u8; + + d0u8 = vld1_dup_u8(above - 1); + d30u8 = vld1_u8(left); + d2u8 = 
vld1_u8(above); + q10u16 = vmovl_u8(d30u8); + q3u16 = vsubl_u8(d2u8, d0u8); + d20u16 = vget_low_u16(q10u16); + for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) { + q0u16 = vdupq_lane_u16(d20u16, 0); + q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16), + vreinterpretq_s16_u16(q0u16)); + d0u8 = vqmovun_s16(q0s16); + vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); + dst += stride; + q0u16 = vdupq_lane_u16(d20u16, 1); + q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16), + vreinterpretq_s16_u16(q0u16)); + d0u8 = vqmovun_s16(q0s16); + vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); + dst += stride; + q0u16 = vdupq_lane_u16(d20u16, 2); + q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16), + vreinterpretq_s16_u16(q0u16)); + d0u8 = vqmovun_s16(q0s16); + vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); + dst += stride; + q0u16 = vdupq_lane_u16(d20u16, 3); + q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16), + vreinterpretq_s16_u16(q0u16)); + d0u8 = vqmovun_s16(q0s16); + vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); + dst += stride; + } +} + +void vp9_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int j, k; + uint16x8_t q0u16, q2u16, q3u16, q8u16, q10u16; + uint8x16_t q0u8, q1u8; + int16x8_t q0s16, q1s16, q8s16, q11s16; + uint16x4_t d20u16; + uint8x8_t d2u8, d3u8, d18u8, d22u8, d23u8; + + q0u8 = vld1q_dup_u8(above - 1); + q1u8 = vld1q_u8(above); + q2u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8)); + q3u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8)); + for (k = 0; k < 2; k++, left += 8) { + d18u8 = vld1_u8(left); + q10u16 = vmovl_u8(d18u8); d20u16 = vget_low_u16(q10u16); for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) { - q0u16 = vdupq_lane_u16(d20u16, 0); - q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16), - vreinterpretq_s16_u16(q0u16)); - d0u8 = vqmovun_s16(q0s16); - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); - dst += y_stride; - q0u16 = vdupq_lane_u16(d20u16, 1); - q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16), - vreinterpretq_s16_u16(q0u16)); - d0u8 = vqmovun_s16(q0s16); - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); - dst += y_stride; - q0u16 = vdupq_lane_u16(d20u16, 2); - q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16), - vreinterpretq_s16_u16(q0u16)); - d0u8 = vqmovun_s16(q0s16); - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); - dst += y_stride; - q0u16 = vdupq_lane_u16(d20u16, 3); - q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16), - vreinterpretq_s16_u16(q0u16)); - d0u8 = vqmovun_s16(q0s16); - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); - dst += y_stride; - } - return; -} - -void vp9_tm_predictor_16x16_neon( - uint8_t *dst, - ptrdiff_t y_stride, - const uint8_t *above, - const uint8_t *left) { - int j, k; - uint16x8_t q0u16, q2u16, q3u16, q8u16, q10u16; - uint8x16_t q0u8, q1u8; - int16x8_t q0s16, q1s16, q8s16, q11s16; - uint16x4_t d20u16; - uint8x8_t d2u8, d3u8, d18u8, d22u8, d23u8; - - q0u8 = vdupq_n_u8(above[-1]); - q1u8 = vld1q_u8(above); - q2u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8)); - q3u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8)); - for (k = 0; k < 2; k++, left += 8) { - d18u8 = vld1_u8(left); - q10u16 = vmovl_u8(d18u8); - d20u16 = vget_low_u16(q10u16); - for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) { - q0u16 = vdupq_lane_u16(d20u16, 0); - q8u16 = vdupq_lane_u16(d20u16, 1); - q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q2u16)); - q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - 
vreinterpretq_s16_u16(q3u16)); - q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16), - vreinterpretq_s16_u16(q2u16)); - q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16), - vreinterpretq_s16_u16(q3u16)); - d2u8 = vqmovun_s16(q1s16); - d3u8 = vqmovun_s16(q0s16); - d22u8 = vqmovun_s16(q11s16); - d23u8 = vqmovun_s16(q8s16); - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8)); - vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8)); - dst += y_stride; - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8)); - vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8)); - dst += y_stride; - - q0u16 = vdupq_lane_u16(d20u16, 2); - q8u16 = vdupq_lane_u16(d20u16, 3); - q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q2u16)); - q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q3u16)); - q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16), - vreinterpretq_s16_u16(q2u16)); - q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16), - vreinterpretq_s16_u16(q3u16)); - d2u8 = vqmovun_s16(q1s16); - d3u8 = vqmovun_s16(q0s16); - d22u8 = vqmovun_s16(q11s16); - d23u8 = vqmovun_s16(q8s16); - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8)); - vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8)); - dst += y_stride; - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8)); - vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8)); - dst += y_stride; - } + q0u16 = vdupq_lane_u16(d20u16, 0); + q8u16 = vdupq_lane_u16(d20u16, 1); + q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q2u16)); + q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q3u16)); + q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16), + vreinterpretq_s16_u16(q2u16)); + q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16), + vreinterpretq_s16_u16(q3u16)); + d2u8 = vqmovun_s16(q1s16); + d3u8 = vqmovun_s16(q0s16); + d22u8 = vqmovun_s16(q11s16); + d23u8 = vqmovun_s16(q8s16); + vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8)); + vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8)); + dst += stride; + vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8)); + vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8)); + dst += stride; + + q0u16 = vdupq_lane_u16(d20u16, 2); + q8u16 = vdupq_lane_u16(d20u16, 3); + q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q2u16)); + q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q3u16)); + q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16), + vreinterpretq_s16_u16(q2u16)); + q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16), + vreinterpretq_s16_u16(q3u16)); + d2u8 = vqmovun_s16(q1s16); + d3u8 = vqmovun_s16(q0s16); + d22u8 = vqmovun_s16(q11s16); + d23u8 = vqmovun_s16(q8s16); + vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8)); + vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8)); + dst += stride; + vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8)); + vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8)); + dst += stride; } - return; -} - -void vp9_tm_predictor_32x32_neon( - uint8_t *dst, - ptrdiff_t y_stride, - const uint8_t *above, - const uint8_t *left) { - int j, k; - uint16x8_t q0u16, q3u16, q8u16, q9u16, q10u16, q11u16; - uint8x16_t q0u8, q1u8, q2u8; - int16x8_t q12s16, q13s16, q14s16, q15s16; - uint16x4_t d6u16; - uint8x8_t d0u8, d1u8, d2u8, d3u8, d26u8; - - q0u8 = vdupq_n_u8(above[-1]); - q1u8 = vld1q_u8(above); - q2u8 = vld1q_u8(above + 16); - q8u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8)); - q9u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8)); 
- q10u16 = vsubl_u8(vget_low_u8(q2u8), vget_low_u8(q0u8)); - q11u16 = vsubl_u8(vget_high_u8(q2u8), vget_high_u8(q0u8)); - for (k = 0; k < 4; k++, left += 8) { - d26u8 = vld1_u8(left); - q3u16 = vmovl_u8(d26u8); - d6u16 = vget_low_u16(q3u16); - for (j = 0; j < 2; j++, d6u16 = vget_high_u16(q3u16)) { - q0u16 = vdupq_lane_u16(d6u16, 0); - q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q8u16)); - q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q9u16)); - q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q10u16)); - q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q11u16)); - d0u8 = vqmovun_s16(q12s16); - d1u8 = vqmovun_s16(q13s16); - d2u8 = vqmovun_s16(q14s16); - d3u8 = vqmovun_s16(q15s16); - q0u8 = vcombine_u8(d0u8, d1u8); - q1u8 = vcombine_u8(d2u8, d3u8); - vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8)); - vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8)); - dst += y_stride; - - q0u16 = vdupq_lane_u16(d6u16, 1); - q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q8u16)); - q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q9u16)); - q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q10u16)); - q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q11u16)); - d0u8 = vqmovun_s16(q12s16); - d1u8 = vqmovun_s16(q13s16); - d2u8 = vqmovun_s16(q14s16); - d3u8 = vqmovun_s16(q15s16); - q0u8 = vcombine_u8(d0u8, d1u8); - q1u8 = vcombine_u8(d2u8, d3u8); - vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8)); - vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8)); - dst += y_stride; - - q0u16 = vdupq_lane_u16(d6u16, 2); - q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q8u16)); - q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q9u16)); - q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q10u16)); - q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q11u16)); - d0u8 = vqmovun_s16(q12s16); - d1u8 = vqmovun_s16(q13s16); - d2u8 = vqmovun_s16(q14s16); - d3u8 = vqmovun_s16(q15s16); - q0u8 = vcombine_u8(d0u8, d1u8); - q1u8 = vcombine_u8(d2u8, d3u8); - vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8)); - vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8)); - dst += y_stride; - - q0u16 = vdupq_lane_u16(d6u16, 3); - q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q8u16)); - q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q9u16)); - q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q10u16)); - q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q11u16)); - d0u8 = vqmovun_s16(q12s16); - d1u8 = vqmovun_s16(q13s16); - d2u8 = vqmovun_s16(q14s16); - d3u8 = vqmovun_s16(q15s16); - q0u8 = vcombine_u8(d0u8, d1u8); - q1u8 = vcombine_u8(d2u8, d3u8); - vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8)); - vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8)); - dst += y_stride; - } + } +} + +void vp9_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int j, k; + uint16x8_t q0u16, q3u16, q8u16, q9u16, q10u16, q11u16; + uint8x16_t q0u8, q1u8, q2u8; + int16x8_t q12s16, q13s16, q14s16, q15s16; + uint16x4_t d6u16; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d26u8; + + q0u8 = vld1q_dup_u8(above - 1); + q1u8 = vld1q_u8(above); + q2u8 = vld1q_u8(above + 16); + 
q8u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8)); + q9u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8)); + q10u16 = vsubl_u8(vget_low_u8(q2u8), vget_low_u8(q0u8)); + q11u16 = vsubl_u8(vget_high_u8(q2u8), vget_high_u8(q0u8)); + for (k = 0; k < 4; k++, left += 8) { + d26u8 = vld1_u8(left); + q3u16 = vmovl_u8(d26u8); + d6u16 = vget_low_u16(q3u16); + for (j = 0; j < 2; j++, d6u16 = vget_high_u16(q3u16)) { + q0u16 = vdupq_lane_u16(d6u16, 0); + q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q8u16)); + q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q9u16)); + q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q10u16)); + q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q11u16)); + d0u8 = vqmovun_s16(q12s16); + d1u8 = vqmovun_s16(q13s16); + d2u8 = vqmovun_s16(q14s16); + d3u8 = vqmovun_s16(q15s16); + q0u8 = vcombine_u8(d0u8, d1u8); + q1u8 = vcombine_u8(d2u8, d3u8); + vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8)); + vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8)); + dst += stride; + + q0u16 = vdupq_lane_u16(d6u16, 1); + q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q8u16)); + q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q9u16)); + q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q10u16)); + q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q11u16)); + d0u8 = vqmovun_s16(q12s16); + d1u8 = vqmovun_s16(q13s16); + d2u8 = vqmovun_s16(q14s16); + d3u8 = vqmovun_s16(q15s16); + q0u8 = vcombine_u8(d0u8, d1u8); + q1u8 = vcombine_u8(d2u8, d3u8); + vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8)); + vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8)); + dst += stride; + + q0u16 = vdupq_lane_u16(d6u16, 2); + q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q8u16)); + q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q9u16)); + q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q10u16)); + q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q11u16)); + d0u8 = vqmovun_s16(q12s16); + d1u8 = vqmovun_s16(q13s16); + d2u8 = vqmovun_s16(q14s16); + d3u8 = vqmovun_s16(q15s16); + q0u8 = vcombine_u8(d0u8, d1u8); + q1u8 = vcombine_u8(d2u8, d3u8); + vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8)); + vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8)); + dst += stride; + + q0u16 = vdupq_lane_u16(d6u16, 3); + q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q8u16)); + q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q9u16)); + q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q10u16)); + q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q11u16)); + d0u8 = vqmovun_s16(q12s16); + d1u8 = vqmovun_s16(q13s16); + d2u8 = vqmovun_s16(q14s16); + d3u8 = vqmovun_s16(q15s16); + q0u8 = vcombine_u8(d0u8, d1u8); + q1u8 = vcombine_u8(d2u8, d3u8); + vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8)); + vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8)); + dst += stride; } - return; + } } +#endif // !HAVE_NEON_ASM diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_reconintra_neon_asm.asm b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_reconintra_neon_asm.asm index dc9856fa887..14f574a50e1 100644 --- 
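All four TM (TrueMotion) kernels above implement the same per-pixel rule, widened to 16 bits so the intermediate sum can leave the 8-bit range before the saturating narrow (vqmovun); in scalar form (illustrative _ref name):

/* Scalar form of the TM predictor computed by the NEON kernels above:
 * each output pixel is left[r] + above[c] - above[-1], clamped to 0..255. */
static void tm_predictor_ref(uint8_t *dst, ptrdiff_t stride, int bs,
                             const uint8_t *above, const uint8_t *left) {
  const int top_left = above[-1];
  int r, c;
  for (r = 0; r < bs; ++r, dst += stride) {
    for (c = 0; c < bs; ++c) {
      const int v = left[r] + above[c] - top_left;
      dst[c] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
    }
  }
}

The switch from vdupq_n_u8(above[-1]) to vld1q_dup_u8(above - 1) only changes how that top-left byte is loaded. In the assembly file below, the matching change from ldrb/vdup to vld1.8 {d0[]} leaves only d0 holding the value, which is why the paired vsubl.u8 instructions there now subtract d0 instead of d1.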
a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_reconintra_neon_asm.asm +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_reconintra_neon_asm.asm @@ -298,8 +298,7 @@ loop_h |vp9_tm_predictor_4x4_neon| PROC ; Load ytop_left = above[-1]; sub r12, r2, #1 - ldrb r12, [r12] - vdup.u8 d0, r12 + vld1.u8 {d0[]}, [r12] ; Load above 4 pixels vld1.32 {d2[0]}, [r2] @@ -309,10 +308,10 @@ loop_h ; Load left row by row and compute left + (above - ytop_left) ; 1st row and 2nd row - ldrb r12, [r3], #1 - ldrb r2, [r3], #1 - vdup.u16 q1, r12 - vdup.u16 q2, r2 + vld1.u8 {d2[]}, [r3]! + vld1.u8 {d4[]}, [r3]! + vmovl.u8 q1, d2 + vmovl.u8 q2, d4 vadd.s16 q1, q1, q3 vadd.s16 q2, q2, q3 vqmovun.s16 d0, q1 @@ -321,10 +320,10 @@ loop_h vst1.32 {d1[0]}, [r0], r1 ; 3rd row and 4th row - ldrb r12, [r3], #1 - ldrb r2, [r3], #1 - vdup.u16 q1, r12 - vdup.u16 q2, r2 + vld1.u8 {d2[]}, [r3]! + vld1.u8 {d4[]}, [r3] + vmovl.u8 q1, d2 + vmovl.u8 q2, d4 vadd.s16 q1, q1, q3 vadd.s16 q2, q2, q3 vqmovun.s16 d0, q1 @@ -345,8 +344,7 @@ loop_h |vp9_tm_predictor_8x8_neon| PROC ; Load ytop_left = above[-1]; sub r12, r2, #1 - ldrb r12, [r12] - vdup.u8 d0, r12 + vld1.8 {d0[]}, [r12] ; preload 8 left vld1.8 {d30}, [r3] @@ -418,8 +416,7 @@ loop_h |vp9_tm_predictor_16x16_neon| PROC ; Load ytop_left = above[-1]; sub r12, r2, #1 - ldrb r12, [r12] - vdup.u8 q0, r12 + vld1.8 {d0[]}, [r12] ; Load above 8 pixels vld1.8 {q1}, [r2] @@ -429,7 +426,7 @@ loop_h ; Compute above - ytop_left vsubl.u8 q2, d2, d0 - vsubl.u8 q3, d3, d1 + vsubl.u8 q3, d3, d0 vmovl.u8 q10, d18 @@ -512,8 +509,7 @@ loop_16x16_neon |vp9_tm_predictor_32x32_neon| PROC ; Load ytop_left = above[-1]; sub r12, r2, #1 - ldrb r12, [r12] - vdup.u8 q0, r12 + vld1.8 {d0[]}, [r12] ; Load above 32 pixels vld1.8 {q1}, [r2]! 
@@ -524,9 +520,9 @@ loop_16x16_neon ; Compute above - ytop_left vsubl.u8 q8, d2, d0 - vsubl.u8 q9, d3, d1 + vsubl.u8 q9, d3, d0 vsubl.u8 q10, d4, d0 - vsubl.u8 q11, d5, d1 + vsubl.u8 q11, d5, d0 vmovl.u8 q3, d26 diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c index 19c582fd109..202d9138199 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c @@ -13,6 +13,7 @@ #include "./vpx_config.h" #include "./vp9_rtcd.h" +#include "vpx_ports/mem.h" #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_idct.h" diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c index 132d88ce5f7..7ceebb6d88c 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c @@ -12,6 +12,7 @@ #include "./vpx_config.h" #include "./vp9_rtcd.h" +#include "vpx_ports/mem.h" #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_idct.h" diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c index 1990348b83a..280190a39ba 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c @@ -13,6 +13,7 @@ #include "./vpx_config.h" #include "./vp9_rtcd.h" +#include "vpx_ports/mem.h" #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_idct.h" diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c index fc44ffa311d..04d226663f9 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c @@ -13,6 +13,7 @@ #include "./vpx_config.h" #include "./vp9_rtcd.h" +#include "vpx_ports/mem.h" #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_idct.h" diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h index 008cf8cacd9..675db654ab9 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h @@ -14,6 +14,7 @@ #include <stdlib.h> #include "./vp9_rtcd.h" +#include "vpx_ports/mem.h" #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_onyxc_int.h" diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve8_avg_horiz_msa.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve8_avg_horiz_msa.c new file mode 100644 index 00000000000..89364cb95cb --- /dev/null +++ 
b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve8_avg_horiz_msa.c @@ -0,0 +1,782 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp9_rtcd.h" +#include "vp9/common/mips/msa/vp9_convolve_msa.h" + +static void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 dst0, dst1, dst2, dst3, res2, res3; + v16u8 mask0, mask1, mask2, mask3; + v8i16 filt, res0, res1; + + mask0 = LD_UB(&mc_filt_mask_arr[16]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, + filt0, filt1, filt2, filt3, res0, res1); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + SRARI_H2_SH(res0, res1, FILTER_BITS); + SAT_SH2_SH(res0, res1, 7); + PCKEV_B2_UB(res0, res0, res1, res1, res2, res3); + ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2); + XORI_B2_128_UB(res2, res3); + AVER_UB2_UB(res2, dst0, res3, dst2, res2, res3); + ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); +} + +static void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 mask0, mask1, mask2, mask3, res0, res1, res2, res3; + v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v8i16 filt, vec0, vec1, vec2, vec3; + + mask0 = LD_UB(&mc_filt_mask_arr[16]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + src += (4 * src_stride); + LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); + HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, + filt0, filt1, filt2, filt3, vec0, vec1); + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, + filt0, filt1, filt2, filt3, vec2, vec3); + SRARI_H4_SH(vec0, vec1, vec2, vec3, FILTER_BITS); + SAT_SH4_SH(vec0, vec1, vec2, vec3, 7); + PCKEV_B4_UB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, res0, res1, res2, + res3); + ILVR_D2_UB(res1, res0, res3, res2, res0, res2); + XORI_B2_128_UB(res0, res2); + ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4, + dst6); + ILVR_D2_UB(dst2, dst0, dst6, dst4, dst0, dst4); + AVER_UB2_UB(res0, dst0, res2, dst4, res0, res2); + ST4x8_UB(res0, res2, dst, dst_stride); +} + +static void common_hz_8t_and_aver_dst_4w_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int8_t *filter, + int32_t height) { + if (4 == height) { + 
common_hz_8t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter); + } else if (8 == height) { + common_hz_8t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, filter); + } +} + +static void common_hz_8t_and_aver_dst_8w_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int8_t *filter, + int32_t height) { + int32_t loop_cnt; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 mask0, mask1, mask2, mask3, dst0, dst1, dst2, dst3; + v8i16 filt, out0, out1, out2, out3; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + src += (4 * src_stride); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, out0, out1, + out2, out3); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3, + dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void common_hz_8t_and_aver_dst_16w_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int8_t *filter, + int32_t height) { + int32_t loop_cnt; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 mask0, mask1, mask2, mask3, dst0, dst1; + v8i16 filt, out0, out1, out2, out3; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + for (loop_cnt = height >> 1; loop_cnt--;) { + LD_SB2(src, src_stride, src0, src2); + LD_SB2(src + 8, src_stride, src1, src3); + src += (2 * src_stride); + + XORI_B4_128_SB(src0, src1, src2, src3); + VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8, vec12); + VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9, vec13); + VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10, + vec14); + VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11, + vec15); + DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8, + vec9, vec10, vec11); + DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0, vec1, + vec2, vec3); + DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, vec8, + vec9, vec10, vec11); + ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1, + out2, out3); + LD_UB2(dst, dst_stride, dst0, dst1); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + PCKEV_XORI128_AVG_ST_UB(out1, out0, dst0, dst); + dst += dst_stride; + PCKEV_XORI128_AVG_ST_UB(out3, out2, dst1, dst); + dst += dst_stride; + } +} + +static void common_hz_8t_and_aver_dst_32w_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int8_t *filter, + int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, 
filt3; + v16u8 dst1, dst2, mask0, mask1, mask2, mask3; + v8i16 filt, out0, out1, out2, out3; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + for (loop_cnt = height; loop_cnt--;) { + src0 = LD_SB(src); + src2 = LD_SB(src + 16); + src3 = LD_SB(src + 24); + src1 = __msa_sldi_b(src2, src0, 8); + src += src_stride; + + XORI_B4_128_SB(src0, src1, src2, src3); + VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8, vec12); + VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9, vec13); + VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10, + vec14); + VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11, + vec15); + DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8, + vec9, vec10, vec11); + DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0, vec1, + vec2, vec3); + DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, vec8, + vec9, vec10, vec11); + ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1, + out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + LD_UB2(dst, 16, dst1, dst2); + PCKEV_XORI128_AVG_ST_UB(out1, out0, dst1, dst); + PCKEV_XORI128_AVG_ST_UB(out3, out2, dst2, dst + 16); + dst += dst_stride; + } +} + +static void common_hz_8t_and_aver_dst_64w_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int8_t *filter, + int32_t height) { + uint32_t loop_cnt, cnt; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 dst1, dst2, mask0, mask1, mask2, mask3; + v8i16 filt, out0, out1, out2, out3; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + for (loop_cnt = height; loop_cnt--;) { + for (cnt = 0; cnt < 2; ++cnt) { + src0 = LD_SB(&src[cnt << 5]); + src2 = LD_SB(&src[16 + (cnt << 5)]); + src3 = LD_SB(&src[24 + (cnt << 5)]); + src1 = __msa_sldi_b(src2, src0, 8); + + XORI_B4_128_SB(src0, src1, src2, src3); + VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8, + vec12); + VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9, + vec13); + VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10, + vec14); + VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11, + vec15); + DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, + vec1, vec2, vec3); + DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8, + vec9, vec10, vec11); + DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0, + vec1, vec2, vec3); + DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, vec8, + vec9, vec10, vec11); + ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1, + out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, 
out2, out3, 7); + LD_UB2(&dst[cnt << 5], 16, dst1, dst2); + PCKEV_XORI128_AVG_ST_UB(out1, out0, dst1, &dst[cnt << 5]); + PCKEV_XORI128_AVG_ST_UB(out3, out2, dst2, &dst[16 + (cnt << 5)]); + } + + src += src_stride; + dst += dst_stride; + } +} + +static void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + v16i8 src0, src1, src2, src3, mask; + v16u8 filt0, dst0, dst1, dst2, dst3, vec0, vec1, res0, res1; + v8u16 vec2, vec3, const255, filt; + + mask = LD_SB(&mc_filt_mask_arr[16]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + const255 = (v8u16)__msa_ldi_h(255); + + LD_SB4(src, src_stride, src0, src1, src2, src3); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3); + SRARI_H2_UH(vec2, vec3, FILTER_BITS); + MIN_UH2_UH(vec2, vec3, const255); + PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1); + ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2); + AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1); + ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); +} + +static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; + v16u8 filt0, vec0, vec1, vec2, vec3, res0, res1, res2, res3; + v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v8u16 vec4, vec5, vec6, vec7, const255, filt; + + mask = LD_SB(&mc_filt_mask_arr[16]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + const255 = (v8u16)__msa_ldi_h(255); + + LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); + VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); + VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5, + vec6, vec7); + SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS); + MIN_UH4_UH(vec4, vec5, vec6, vec7, const255); + PCKEV_B4_UB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2, + res3); + ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4, + dst6); + AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1, res2, + res3); + ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); + dst += (4 * dst_stride); + ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); +} + +static void common_hz_2t_and_aver_dst_4w_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int8_t *filter, + int32_t height) { + if (4 == height) { + common_hz_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter); + } else if (8 == height) { + common_hz_2t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, filter); + } +} + +static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + v16i8 src0, src1, src2, src3, mask; + v16u8 filt0, dst0, dst1, dst2, dst3; + v8u16 vec0, vec1, vec2, vec3, const255, filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + const255 = (v8u16)__msa_ldi_h(255); + + LD_SB4(src, src_stride, src0, src1, src2, src3); + VSHF_B2_UH(src0, 
src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + MIN_UH4_UH(vec0, vec1, vec2, vec3, const255); + PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, + dst, dst_stride); +} + +static void common_hz_2t_and_aver_dst_8x8mult_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int8_t *filter, + int32_t height) { + v16i8 src0, src1, src2, src3, mask; + v16u8 filt0, dst0, dst1, dst2, dst3; + v8u16 vec0, vec1, vec2, vec3, const255, filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + const255 = (v8u16)__msa_ldi_h(255); + + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + MIN_UH4_UH(vec0, vec1, vec2, vec3, const255); + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, + dst, dst_stride); + dst += (4 * dst_stride); + + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + MIN_UH4_UH(vec0, vec1, vec2, vec3, const255); + PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, + dst, dst_stride); + dst += (4 * dst_stride); + + if (16 == height) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + MIN_UH4_UH(vec0, vec1, vec2, vec3, const255); + LD_SB4(src, src_stride, src0, src1, src2, src3); + PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, + dst, dst_stride); + dst += (4 * dst_stride); + + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + MIN_UH4_UH(vec0, vec1, vec2, vec3, const255); + PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, + dst, dst_stride); + } +} + +static void common_hz_2t_and_aver_dst_8w_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int8_t *filter, + int32_t height) { + if (4 == height) { + common_hz_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, filter); + } else { + common_hz_2t_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride, + filter, height); + } +} + +static void 
common_hz_2t_and_aver_dst_16w_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int8_t *filter, + int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; + v16u8 filt0, dst0, dst1, dst2, dst3; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 res0, res1, res2, res3, res4, res5, res6, res7, const255, filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + const255 = (v8u16)__msa_ldi_h(255); + + LD_SB4(src, src_stride, src0, src2, src4, src6); + LD_SB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + + VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); + VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1, + res2, res3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5, + res6, res7); + SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS); + SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + MIN_UH4_UH(res0, res1, res2, res3, const255); + MIN_UH4_UH(res4, res5, res6, res7, const255); + PCKEV_AVG_ST_UB(res1, res0, dst0, dst); + dst += dst_stride; + PCKEV_AVG_ST_UB(res3, res2, dst1, dst); + dst += dst_stride; + PCKEV_AVG_ST_UB(res5, res4, dst2, dst); + dst += dst_stride; + PCKEV_AVG_ST_UB(res7, res6, dst3, dst); + dst += dst_stride; + + for (loop_cnt = (height >> 2) - 1; loop_cnt--;) { + LD_SB4(src, src_stride, src0, src2, src4, src6); + LD_SB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + + VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); + VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1, + res2, res3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5, + res6, res7); + SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS); + SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + MIN_UH4_UH(res0, res1, res2, res3, const255); + MIN_UH4_UH(res4, res5, res6, res7, const255); + PCKEV_AVG_ST_UB(res1, res0, dst0, dst); + dst += dst_stride; + PCKEV_AVG_ST_UB(res3, res2, dst1, dst); + dst += dst_stride; + PCKEV_AVG_ST_UB(res5, res4, dst2, dst); + dst += dst_stride; + PCKEV_AVG_ST_UB(res7, res6, dst3, dst); + dst += dst_stride; + } +} + +static void common_hz_2t_and_aver_dst_32w_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int8_t *filter, + int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; + v16u8 filt0, dst0, dst1, dst2, dst3; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 res0, res1, res2, res3, res4, res5, res6, res7, const255, filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + const255 = (v8u16)__msa_ldi_h(255); + + for (loop_cnt = (height >> 1); loop_cnt--;) { + src0 = LD_SB(src); + src2 = LD_SB(src + 16); + src3 = LD_SB(src + 24); + src1 = __msa_sldi_b(src2, src0, 8); + 
src += src_stride; + src4 = LD_SB(src); + src6 = LD_SB(src + 16); + src7 = LD_SB(src + 24); + src5 = __msa_sldi_b(src6, src4, 8); + src += src_stride; + + VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); + VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1, + res2, res3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5, + res6, res7); + SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS); + SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS); + MIN_UH4_UH(res0, res1, res2, res3, const255); + MIN_UH4_UH(res4, res5, res6, res7, const255); + LD_UB2(dst, 16, dst0, dst1); + PCKEV_AVG_ST_UB(res1, res0, dst0, dst); + PCKEV_AVG_ST_UB(res3, res2, dst1, (dst + 16)); + dst += dst_stride; + LD_UB2(dst, 16, dst2, dst3); + PCKEV_AVG_ST_UB(res5, res4, dst2, dst); + PCKEV_AVG_ST_UB(res7, res6, dst3, (dst + 16)); + dst += dst_stride; + } +} + +static void common_hz_2t_and_aver_dst_64w_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int8_t *filter, + int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; + v16u8 filt0, dst0, dst1, dst2, dst3; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 out0, out1, out2, out3, out4, out5, out6, out7, const255, filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + const255 = (v8u16)__msa_ldi_h(255); + + for (loop_cnt = height; loop_cnt--;) { + LD_SB4(src, 16, src0, src2, src4, src6); + src7 = LD_SB(src + 56); + SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8); + src += src_stride; + + VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); + VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, + out2, out3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, + out6, out7); + SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); + SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); + LD_UB4(dst, 16, dst0, dst1, dst2, dst3); + MIN_UH4_UH(out0, out1, out2, out3, const255); + MIN_UH4_UH(out4, out5, out6, out7, const255); + PCKEV_AVG_ST_UB(out1, out0, dst0, dst); + PCKEV_AVG_ST_UB(out3, out2, dst1, dst + 16); + PCKEV_AVG_ST_UB(out5, out4, dst2, dst + 32); + PCKEV_AVG_ST_UB(out7, out6, dst3, dst + 48); + dst += dst_stride; + } +} + +void vp9_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + int8_t cnt, filt_hor[8]; + + if (16 != x_step_q4) { + vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h); + return; + } + + if (((const int32_t *)filter_x)[1] == 0x800000) { + vp9_convolve_avg(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h); + return; + } + + for (cnt = 0; cnt < 8; ++cnt) { + filt_hor[cnt] = filter_x[cnt]; + } + + if (((const int32_t *)filter_x)[0] == 0) { + switch (w) { + case 4: + common_hz_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride, 
+ dst, (int32_t)dst_stride, + &filt_hor[3], h); + break; + case 8: + common_hz_2t_and_aver_dst_8w_msa(src, (int32_t)src_stride, + dst, (int32_t)dst_stride, + &filt_hor[3], h); + break; + case 16: + common_hz_2t_and_aver_dst_16w_msa(src, (int32_t)src_stride, + dst, (int32_t)dst_stride, + &filt_hor[3], h); + break; + case 32: + common_hz_2t_and_aver_dst_32w_msa(src, (int32_t)src_stride, + dst, (int32_t)dst_stride, + &filt_hor[3], h); + break; + case 64: + common_hz_2t_and_aver_dst_64w_msa(src, (int32_t)src_stride, + dst, (int32_t)dst_stride, + &filt_hor[3], h); + break; + default: + vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h); + break; + } + } else { + switch (w) { + case 4: + common_hz_8t_and_aver_dst_4w_msa(src, (int32_t)src_stride, + dst, (int32_t)dst_stride, + filt_hor, h); + break; + case 8: + common_hz_8t_and_aver_dst_8w_msa(src, (int32_t)src_stride, + dst, (int32_t)dst_stride, + filt_hor, h); + break; + case 16: + common_hz_8t_and_aver_dst_16w_msa(src, (int32_t)src_stride, + dst, (int32_t)dst_stride, + filt_hor, h); + break; + case 32: + common_hz_8t_and_aver_dst_32w_msa(src, (int32_t)src_stride, + dst, (int32_t)dst_stride, + filt_hor, h); + break; + case 64: + common_hz_8t_and_aver_dst_64w_msa(src, (int32_t)src_stride, + dst, (int32_t)dst_stride, + filt_hor, h); + break; + default: + vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h); + break; + } + } +} diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve8_avg_msa.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve8_avg_msa.c new file mode 100644 index 00000000000..e9f3a9dc3f2 --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve8_avg_msa.c @@ -0,0 +1,679 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "./vp9_rtcd.h" +#include "vp9/common/mips/msa/vp9_convolve_msa.h" + +static void common_hv_8ht_8vt_and_aver_dst_4w_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int8_t *filter_horiz, + int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3, tmp0, tmp1; + v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3; + v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; + v8i16 hz_out7, hz_out8, hz_out9, res0, res1, vec0, vec1, vec2, vec3, vec4; + v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3; + + mask0 = LD_UB(&mc_filt_mask_arr[16]); + src -= (3 + 3 * src_stride); + + /* rearranging filter */ + filt = LD_SH(filter_horiz); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + src += (7 * src_stride); + + hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8); + + filt = LD_SH(filter_vert); + SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3); + + ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + vec2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src7, src8, src9, src10); + XORI_B4_128_SB(src7, src8, src9, src10); + src += (4 * src_stride); + + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, + filt_hz0, filt_hz1, filt_hz2, filt_hz3); + hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8); + vec3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6); + res0 = FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + + hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3, + filt_hz0, filt_hz1, filt_hz2, filt_hz3); + hz_out8 = (v8i16)__msa_sldi_b((v16i8)hz_out9, (v16i8)hz_out7, 8); + vec4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8); + res1 = FILT_8TAP_DPADD_S_H(vec1, vec2, vec3, vec4, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2); + + SRARI_H2_SH(res0, res1, FILTER_BITS); + SAT_SH2_SH(res0, res1, 7); + PCKEV_B2_UB(res0, res0, res1, res1, tmp0, tmp1); + XORI_B2_128_UB(tmp0, tmp1); + AVER_UB2_UB(tmp0, dst0, tmp1, dst2, tmp0, tmp1); + ST4x4_UB(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride); + dst += (4 * dst_stride); + + hz_out5 = hz_out9; + vec0 = vec2; + vec1 = vec3; + vec2 = vec4; + } +} + +static void common_hv_8ht_8vt_and_aver_dst_8w_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int8_t *filter_horiz, + int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3; + v8i16 filt, 
filt_vt0, filt_vt1, filt_vt2, filt_vt3; + v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3; + v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; + v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3; + v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= (3 + 3 * src_stride); + + /* rearranging filter */ + filt = LD_SH(filter_horiz); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); + src += (7 * src_stride); + + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + + filt = LD_SH(filter_vert); + SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3); + + ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1); + ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4); + ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src7, src8, src9, src10); + XORI_B4_128_SB(src7, src8, src9, src10); + src += (4 * src_stride); + + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + + hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, + filt_hz0, filt_hz1, filt_hz2, filt_hz3); + out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6); + tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + + hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, + filt_hz0, filt_hz1, filt_hz2, filt_hz3); + out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7); + tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + + hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, + filt_hz0, filt_hz1, filt_hz2, filt_hz3); + out8 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8); + tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + + hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3, + filt_hz0, filt_hz1, filt_hz2, filt_hz3); + out9 = (v8i16)__msa_ilvev_b((v16i8)hz_out10, (v16i8)hz_out9); + tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + + SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); + CONVERT_UB_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst2, dst3, + dst, dst_stride); + dst += (4 * dst_stride); + + hz_out6 = hz_out10; + out0 = out2; + out1 = out3; + out2 = out8; + out4 = out6; + out5 = out7; + out6 = out9; + } +} + +static void common_hv_8ht_8vt_and_aver_dst_16w_msa(const uint8_t *src, + int32_t 
src_stride, + uint8_t *dst, + int32_t dst_stride, + int8_t *filter_horiz, + int8_t *filter_vert, + int32_t height) { + int32_t multiple8_cnt; + for (multiple8_cnt = 2; multiple8_cnt--;) { + common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height); + src += 8; + dst += 8; + } +} + +static void common_hv_8ht_8vt_and_aver_dst_32w_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int8_t *filter_horiz, + int8_t *filter_vert, + int32_t height) { + int32_t multiple8_cnt; + for (multiple8_cnt = 4; multiple8_cnt--;) { + common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height); + src += 8; + dst += 8; + } +} + +static void common_hv_8ht_8vt_and_aver_dst_64w_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int8_t *filter_horiz, + int8_t *filter_vert, + int32_t height) { + int32_t multiple8_cnt; + for (multiple8_cnt = 8; multiple8_cnt--;) { + common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height); + src += 8; + dst += 8; + } +} + +static void common_hv_2ht_2vt_and_aver_dst_4x4_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int8_t *filter_horiz, + int8_t *filter_vert) { + v16i8 src0, src1, src2, src3, src4, mask; + v16u8 filt_hz, filt_vt, vec0, vec1; + v16u8 dst0, dst1, dst2, dst3, res0, res1; + v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1, filt; + + mask = LD_SB(&mc_filt_mask_arr[16]); + + /* rearranging filter */ + filt = LD_UH(filter_horiz); + filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0); + + filt = LD_UH(filter_vert); + filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); + hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8); + hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + SAT_UH2_UH(tmp0, tmp1, 7); + PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1); + AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1); + ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); +} + +static void common_hv_2ht_2vt_and_aver_dst_4x8_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int8_t *filter_horiz, + int8_t *filter_vert) { + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask; + v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1, res2, res3; + v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; + v8u16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3; + v8i16 filt; + + mask = LD_SB(&mc_filt_mask_arr[16]); + + /* rearranging filter */ + filt = LD_SH(filter_horiz); + filt_hz = (v16u8)__msa_splati_h(filt, 0); + + filt = LD_SH(filter_vert); + filt_vt = (v16u8)__msa_splati_h(filt, 0); + + LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + src8 = LD_SB(src); + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 
FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); + hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, FILTER_BITS); + hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, FILTER_BITS); + hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS); + SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1, + hz_out3, hz_out5, 8); + hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6); + + LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); + ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, + dst4, dst6); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, + tmp0, tmp1, tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7); + PCKEV_B4_UB(tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3, res0, res1, + res2, res3); + AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1, + res2, res3); + ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); + dst += (4 * dst_stride); + ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); +} + +static void common_hv_2ht_2vt_and_aver_dst_4w_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int8_t *filter_horiz, + int8_t *filter_vert, + int32_t height) { + if (4 == height) { + common_hv_2ht_2vt_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert); + } else if (8 == height) { + common_hv_2ht_2vt_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert); + } +} + +static void common_hv_2ht_2vt_and_aver_dst_8x4_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int8_t *filter_horiz, + int8_t *filter_vert) { + v16i8 src0, src1, src2, src3, src4, mask; + v16u8 filt_hz, filt_vt, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3; + v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; + v8i16 filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_SH(filter_horiz); + filt_hz = (v16u8)__msa_splati_h(filt, 0); + + filt = LD_SH(filter_vert); + filt_vt = (v16u8)__msa_splati_h(filt, 0); + + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + src += (5 * src_stride); + + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp0 = __msa_dotp_u_h(vec0, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + vec1 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp1 = __msa_dotp_u_h(vec1, filt_vt); + + hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + vec2 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp2 = __msa_dotp_u_h(vec2, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + vec3 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp3 = __msa_dotp_u_h(vec3, filt_vt); + + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7); + PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, + dst, dst_stride); +} + +static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t 
dst_stride, + int8_t *filter_horiz, + int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, mask; + v16u8 filt_hz, filt_vt, vec0, dst0, dst1, dst2, dst3; + v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; + v8i16 filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_SH(filter_horiz); + filt_hz = (v16u8)__msa_splati_h(filt, 0); + + filt = LD_SH(filter_vert); + filt_vt = (v16u8)__msa_splati_h(filt, 0); + + src0 = LD_SB(src); + src += src_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + + hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp0 = __msa_dotp_u_h(vec0, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp1 = __msa_dotp_u_h(vec0, filt_vt); + + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + SAT_UH2_UH(tmp0, tmp1, 7); + + hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp2 = __msa_dotp_u_h(vec0, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp3 = __msa_dotp_u_h(vec0, filt_vt); + + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + SAT_UH2_UH(tmp2, tmp3, 7); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, + dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void common_hv_2ht_2vt_and_aver_dst_8w_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int8_t *filter_horiz, + int8_t *filter_vert, + int32_t height) { + if (4 == height) { + common_hv_2ht_2vt_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert); + } else { + common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, + height); + } +} + +static void common_hv_2ht_2vt_and_aver_dst_16w_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int8_t *filter_horiz, + int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; + v16u8 filt_hz, filt_vt, vec0, vec1, dst0, dst1, dst2, dst3; + v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1; + v8i16 filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_SH(filter_horiz); + filt_hz = (v16u8)__msa_splati_h(filt, 0); + + filt = LD_SH(filter_vert); + filt_vt = (v16u8)__msa_splati_h(filt, 0); + + LD_SB2(src, 8, src0, src1); + src += src_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src2, src4, src6); + LD_SB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + + hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + 
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + SAT_UH2_UH(tmp0, tmp1, 7); + PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst); + dst += dst_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + SAT_UH2_UH(tmp0, tmp1, 7); + PCKEV_AVG_ST_UB(tmp1, tmp0, dst1, dst); + dst += dst_stride; + + hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + SAT_UH2_UH(tmp0, tmp1, 7); + PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst); + dst += dst_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + SAT_UH2_UH(tmp0, tmp1, 7); + PCKEV_AVG_ST_UB(tmp1, tmp0, dst3, dst); + dst += dst_stride; + } +} + +static void common_hv_2ht_2vt_and_aver_dst_32w_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int8_t *filter_horiz, + int8_t *filter_vert, + int32_t height) { + int32_t multiple8_cnt; + for (multiple8_cnt = 2; multiple8_cnt--;) { + common_hv_2ht_2vt_and_aver_dst_16w_msa(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height); + src += 16; + dst += 16; + } +} + +static void common_hv_2ht_2vt_and_aver_dst_64w_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int8_t *filter_horiz, + int8_t *filter_vert, + int32_t height) { + int32_t multiple8_cnt; + for (multiple8_cnt = 4; multiple8_cnt--;) { + common_hv_2ht_2vt_and_aver_dst_16w_msa(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height); + src += 16; + dst += 16; + } +} + +void vp9_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + int8_t cnt, filt_hor[8], filt_ver[8]; + + if (16 != x_step_q4 || 16 != y_step_q4) { + vp9_convolve8_avg_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h); + return; + } + + if (((const int32_t *)filter_x)[1] == 0x800000 && + ((const int32_t *)filter_y)[1] == 0x800000) { + vp9_convolve_avg(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h); + return; + } + + for (cnt = 0; cnt < 8; ++cnt) { + filt_hor[cnt] = filter_x[cnt]; + filt_ver[cnt] = filter_y[cnt]; + } + + if (((const int32_t *)filter_x)[0] == 0 && + ((const int32_t *)filter_y)[0] == 0) { + switch (w) { + case 4: + common_hv_2ht_2vt_and_aver_dst_4w_msa(src, (int32_t)src_stride, + dst, (int32_t)dst_stride, + &filt_hor[3], &filt_ver[3], h); + break; + case 8: + common_hv_2ht_2vt_and_aver_dst_8w_msa(src, (int32_t)src_stride, + dst, (int32_t)dst_stride, + &filt_hor[3], &filt_ver[3], h); + break; + case 16: + common_hv_2ht_2vt_and_aver_dst_16w_msa(src, (int32_t)src_stride, + dst, (int32_t)dst_stride, + &filt_hor[3], &filt_ver[3], h); + break; + case 32: + common_hv_2ht_2vt_and_aver_dst_32w_msa(src, (int32_t)src_stride, + dst, 
(int32_t)dst_stride, + &filt_hor[3], &filt_ver[3], h); + break; + case 64: + common_hv_2ht_2vt_and_aver_dst_64w_msa(src, (int32_t)src_stride, + dst, (int32_t)dst_stride, + &filt_hor[3], &filt_ver[3], h); + break; + default: + vp9_convolve8_avg_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h); + break; + } + } else if (((const int32_t *)filter_x)[0] == 0 || + ((const int32_t *)filter_y)[0] == 0) { + vp9_convolve8_avg_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h); + } else { + switch (w) { + case 4: + common_hv_8ht_8vt_and_aver_dst_4w_msa(src, (int32_t)src_stride, + dst, (int32_t)dst_stride, + filt_hor, filt_ver, h); + break; + case 8: + common_hv_8ht_8vt_and_aver_dst_8w_msa(src, (int32_t)src_stride, + dst, (int32_t)dst_stride, + filt_hor, filt_ver, h); + break; + case 16: + common_hv_8ht_8vt_and_aver_dst_16w_msa(src, (int32_t)src_stride, + dst, (int32_t)dst_stride, + filt_hor, filt_ver, h); + break; + case 32: + common_hv_8ht_8vt_and_aver_dst_32w_msa(src, (int32_t)src_stride, + dst, (int32_t)dst_stride, + filt_hor, filt_ver, h); + break; + case 64: + common_hv_8ht_8vt_and_aver_dst_64w_msa(src, (int32_t)src_stride, + dst, (int32_t)dst_stride, + filt_hor, filt_ver, h); + break; + default: + vp9_convolve8_avg_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h); + break; + } + } +} diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve8_avg_vert_msa.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve8_avg_vert_msa.c new file mode 100644 index 00000000000..85dfd30c7de --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve8_avg_vert_msa.c @@ -0,0 +1,753 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "./vp9_rtcd.h" +#include "vp9/common/mips/msa/vp9_convolve_msa.h" + +static void common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int8_t *filter, + int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16u8 dst0, dst1, dst2, dst3, out; + v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; + v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776; + v16i8 src10998, filt0, filt1, filt2, filt3; + v8i16 filt, out10, out32; + + src -= (3 * src_stride); + + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); + src += (7 * src_stride); + + ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r, + src54_r, src21_r); + ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); + ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110, + src4332, src6554); + XORI_B3_128_SB(src2110, src4332, src6554); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src7, src8, src9, src10); + src += (4 * src_stride); + + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, + src87_r, src98_r, src109_r); + ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998); + XORI_B2_128_SB(src8776, src10998); + out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0, + filt1, filt2, filt3); + out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0, + filt1, filt2, filt3); + SRARI_H2_SH(out10, out32, FILTER_BITS); + SAT_SH2_SH(out10, out32, 7); + out = PCKEV_XORI128_UB(out10, out32); + ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2); + + dst0 = (v16u8)__msa_ilvr_d((v2i64)dst2, (v2i64)dst0); + out = __msa_aver_u_b(out, dst0); + + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); + dst += (4 * dst_stride); + + src2110 = src6554; + src4332 = src8776; + src6554 = src10998; + src6 = src10; + } +} + +static void common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int8_t *filter, + int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16u8 dst0, dst1, dst2, dst3; + v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; + v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3; + v8i16 filt, out0, out1, out2, out3; + + src -= (3 * src_stride); + + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); + src += (7 * src_stride); + + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r, + src54_r, src21_r); + ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src7, src8, src9, src10); + src += (4 * src_stride); + + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + XORI_B4_128_SB(src7, src8, src9, src10); + ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, + src87_r, src98_r, src109_r); + out0 = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0, + filt1, filt2, filt3); + out1 = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0, + filt1, filt2, filt3); + out2 = 
FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0, + filt1, filt2, filt3); + out3 = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0, + filt1, filt2, filt3); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3, + dst, dst_stride); + dst += (4 * dst_stride); + + src10_r = src54_r; + src32_r = src76_r; + src54_r = src98_r; + src21_r = src65_r; + src43_r = src87_r; + src65_r = src109_r; + src6 = src10; + } +} + +static void common_vt_8t_and_aver_dst_16w_mult_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int8_t *filter, + int32_t height, + int32_t width) { + const uint8_t *src_tmp; + uint8_t *dst_tmp; + uint32_t loop_cnt, cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; + v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l; + v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l; + v16i8 filt0, filt1, filt2, filt3; + v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3; + v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l, filt; + + src -= (3 * src_stride); + + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + for (cnt = (width >> 4); cnt--;) { + src_tmp = src; + dst_tmp = dst; + + LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6); + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + src_tmp += (7 * src_stride); + + ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r, + src54_r, src21_r); + ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); + ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l, + src54_l, src21_l); + ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src_tmp, src_stride, src7, src8, src9, src10); + src_tmp += (4 * src_stride); + + LD_UB4(dst_tmp, dst_stride, dst0, dst1, dst2, dst3); + XORI_B4_128_SB(src7, src8, src9, src10); + ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, + src87_r, src98_r, src109_r); + ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l, + src87_l, src98_l, src109_l); + out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0, + filt1, filt2, filt3); + out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0, + filt1, filt2, filt3); + out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0, + filt1, filt2, filt3); + out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0, + filt1, filt2, filt3); + out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0, + filt1, filt2, filt3); + out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0, + filt1, filt2, filt3); + out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0, + filt1, filt2, filt3); + out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0, + filt1, filt2, filt3); + SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS); + SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, FILTER_BITS); + SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); + SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7); + PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, + out3_r, tmp0, tmp1, tmp2, tmp3); + XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3); + AVER_UB4_UB(tmp0, dst0, 
tmp1, dst1, tmp2, dst2, tmp3, dst3, dst0, dst1, + dst2, dst3); + ST_UB4(dst0, dst1, dst2, dst3, dst_tmp, dst_stride); + dst_tmp += (4 * dst_stride); + + src10_r = src54_r; + src32_r = src76_r; + src54_r = src98_r; + src21_r = src65_r; + src43_r = src87_r; + src65_r = src109_r; + src10_l = src54_l; + src32_l = src76_l; + src54_l = src98_l; + src21_l = src65_l; + src43_l = src87_l; + src65_l = src109_l; + src6 = src10; + } + + src += 16; + dst += 16; + } +} + +static void common_vt_8t_and_aver_dst_16w_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int8_t *filter, + int32_t height) { + common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride, + filter, height, 16); +} + +static void common_vt_8t_and_aver_dst_32w_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int8_t *filter, + int32_t height) { + common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride, + filter, height, 32); +} + +static void common_vt_8t_and_aver_dst_64w_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int8_t *filter, + int32_t height) { + common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride, + filter, height, 64); +} + +static void common_vt_2t_and_aver_dst_4x4_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + v16i8 src0, src1, src2, src3, src4; + v16u8 dst0, dst1, dst2, dst3, out, filt0, src2110, src4332; + v16i8 src10_r, src32_r, src21_r, src43_r; + v8i16 filt; + v8u16 tmp0, tmp1; + + filt = LD_SH(filter); + filt0 = (v16u8)__msa_splati_h(filt, 0); + + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + src4 = LD_SB(src); + src += src_stride; + + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst1); + dst0 = (v16u8)__msa_ilvr_d((v2i64)dst1, (v2i64)dst0); + ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, + src32_r, src43_r); + ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); + DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + SAT_UH2_UH(tmp0, tmp1, 7); + + out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + out = __msa_aver_u_b(out, dst0); + + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); +} + +static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src87_r; + v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r; + v16u8 src2110, src4332, src6554, src8776, filt0; + v8u16 tmp0, tmp1, tmp2, tmp3; + v8i16 filt; + + filt = LD_SH(filter); + filt0 = (v16u8)__msa_splati_h(filt, 0); + + LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + src8 = LD_SB(src); + + LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); + ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst1, + dst2, dst3); + ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1); + ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, + src32_r, src43_r); + ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r, + src76_r, src87_r); + ILVR_D4_UB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, + src87_r, src76_r, src2110, src4332, src6554, 
src8776); + DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0, + tmp0, tmp1, tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7); + PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src2110, src4332); + AVER_UB2_UB(src2110, dst0, src4332, dst1, src2110, src4332); + ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride); + dst += (4 * dst_stride); + ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst, dst_stride); +} + +static void common_vt_2t_and_aver_dst_4w_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int8_t *filter, + int32_t height) { + if (4 == height) { + common_vt_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter); + } else if (8 == height) { + common_vt_2t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, filter); + } +} + +static void common_vt_2t_and_aver_dst_8x4_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + v16u8 src0, src1, src2, src3, src4; + v16u8 dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3, filt0; + v8u16 tmp0, tmp1, tmp2, tmp3; + v8i16 filt; + + /* rearranging filter_y */ + filt = LD_SH(filter); + filt0 = (v16u8)__msa_splati_h(filt, 0); + + LD_UB5(src, src_stride, src0, src1, src2, src3, src4); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1); + ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, + tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7); + PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, + dst, dst_stride); +} + +static void common_vt_2t_and_aver_dst_8x8mult_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int8_t *filter, + int32_t height) { + uint32_t loop_cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v16u8 dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; + v8u16 tmp0, tmp1, tmp2, tmp3; + v8i16 filt; + + /* rearranging filter_y */ + filt = LD_SH(filter); + filt0 = (v16u8)__msa_splati_h(filt, 0); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 3); loop_cnt--;) { + LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8); + src += (8 * src_stride); + LD_UB8(dst, dst_stride, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8); + + ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, + vec2, vec3); + ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5, + vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, + tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7); + PCKEV_AVG_ST8x4_UB(tmp0, dst1, tmp1, dst2, tmp2, dst3, tmp3, dst4, + dst, dst_stride); + dst += (4 * dst_stride); + + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1, + tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7); + PCKEV_AVG_ST8x4_UB(tmp0, dst5, tmp1, dst6, tmp2, dst7, tmp3, dst8, + dst, dst_stride); + dst += (4 * dst_stride); + + src0 = src8; + } +} + +static void common_vt_2t_and_aver_dst_8w_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int8_t *filter, + int32_t height) { + if (4 == height) { + 
common_vt_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, filter); + } else { + common_vt_2t_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride, + filter, height); + } +} + +static void common_vt_2t_and_aver_dst_16w_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int8_t *filter, + int32_t height) { + uint32_t loop_cnt; + v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2, dst3, filt0; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 tmp0, tmp1, tmp2, tmp3, filt; + + /* rearranging filter_y */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); + ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + SAT_UH2_UH(tmp0, tmp1, 7); + PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst); + dst += dst_stride; + + ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6); + ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7); + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + SAT_UH2_UH(tmp2, tmp3, 7); + PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst); + dst += dst_stride; + + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + SAT_UH2_UH(tmp0, tmp1, 7); + PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst); + dst += dst_stride; + + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + SAT_UH2_UH(tmp2, tmp3, 7); + PCKEV_AVG_ST_UB(tmp3, tmp2, dst3, dst); + dst += dst_stride; + + src0 = src4; + } +} + +static void common_vt_2t_and_aver_dst_32w_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int8_t *filter, + int32_t height) { + uint32_t loop_cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9; + v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; + v8u16 tmp0, tmp1, tmp2, tmp3, filt; + + /* rearranging filter_y */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_UB2(src, 16, src0, src5); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); + ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); + + LD_UB4(src + 16, src_stride, src6, src7, src8, src9); + LD_UB4(dst + 16, dst_stride, dst4, dst5, dst6, dst7); + src += (4 * src_stride); + + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + SAT_UH2_UH(tmp0, tmp1, 7); + PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst); + + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + SAT_UH2_UH(tmp2, tmp3, 7); + PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst + dst_stride); + + ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6); + ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7); + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + SAT_UH2_UH(tmp0, tmp1, 7); + PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst + 2 * dst_stride); + + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + SAT_UH2_UH(tmp2, tmp3, 7); 
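/*
 * A scalar sketch of what the DOTP_UB*_UH -> SRARI_H*_UH -> SAT_UH*_UH ->
 * __msa_aver_u_b sequence in the 2-tap vertical "and_aver_dst" kernels above
 * computes for a single output pixel, assuming FILTER_BITS == 7 and the two
 * non-zero bilinear taps passed in via &filt_ver[3].  The helper name and
 * signature are illustrative only, not part of libvpx or of this change.
 */
#include <stdint.h>

#define SKETCH_FILTER_BITS 7

static uint8_t vt_2t_aver_dst_pixel_sketch(uint8_t above, uint8_t below,
                                           uint8_t f0, uint8_t f1,
                                           uint8_t dst) {
  /* 2-tap vertical filter: weighted sum of two vertically adjacent pixels
   * (DOTP_UB*_UH, sixteen lanes at a time in the vector code). */
  int32_t sum = above * f0 + below * f1;
  /* SRARI: arithmetic shift right by FILTER_BITS with round-to-nearest. */
  int32_t filtered =
      (sum + (1 << (SKETCH_FILTER_BITS - 1))) >> SKETCH_FILTER_BITS;
  /* SAT_UH ... 7: saturate to the unsigned 8-bit range. */
  if (filtered > 255) filtered = 255;
  /* aver_u_b: round-to-nearest average with the existing dst pixel. */
  return (uint8_t)((filtered + dst + 1) >> 1);
}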
+ PCKEV_AVG_ST_UB(tmp3, tmp2, dst3, dst + 3 * dst_stride); + + ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2); + ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + SAT_UH2_UH(tmp0, tmp1, 7); + PCKEV_AVG_ST_UB(tmp1, tmp0, dst4, dst + 16); + + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + SAT_UH2_UH(tmp2, tmp3, 7); + PCKEV_AVG_ST_UB(tmp3, tmp2, dst5, dst + 16 + dst_stride); + + ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6); + ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7); + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + SAT_UH2_UH(tmp0, tmp1, 7); + PCKEV_AVG_ST_UB(tmp1, tmp0, dst6, dst + 16 + 2 * dst_stride); + + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + SAT_UH2_UH(tmp2, tmp3, 7); + PCKEV_AVG_ST_UB(tmp3, tmp2, dst7, dst + 16 + 3 * dst_stride); + dst += (4 * dst_stride); + + src0 = src4; + src5 = src9; + } +} + +static void common_vt_2t_and_aver_dst_64w_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + int8_t *filter, + int32_t height) { + uint32_t loop_cnt; + v16u8 src0, src1, src2, src3, src4, src5; + v16u8 src6, src7, src8, src9, src10, src11, filt0; + v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + v8u16 filt; + + /* rearranging filter_y */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_UB4(src, 16, src0, src3, src6, src9); + src += src_stride; + + for (loop_cnt = (height >> 1); loop_cnt--;) { + LD_UB2(src, src_stride, src1, src2); + LD_UB2(dst, dst_stride, dst0, dst1); + LD_UB2(src + 16, src_stride, src4, src5); + LD_UB2(dst + 16, dst_stride, dst2, dst3); + LD_UB2(src + 32, src_stride, src7, src8); + LD_UB2(dst + 32, dst_stride, dst4, dst5); + LD_UB2(src + 48, src_stride, src10, src11); + LD_UB2(dst + 48, dst_stride, dst6, dst7); + src += (2 * src_stride); + + ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); + ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + SAT_UH2_UH(tmp0, tmp1, 7); + PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst); + + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + SAT_UH2_UH(tmp2, tmp3, 7); + PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst + dst_stride); + + ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6); + ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7); + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5); + SRARI_H2_UH(tmp4, tmp5, FILTER_BITS); + SAT_UH2_UH(tmp4, tmp5, 7); + PCKEV_AVG_ST_UB(tmp5, tmp4, dst2, dst + 16); + + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7); + SRARI_H2_UH(tmp6, tmp7, FILTER_BITS); + SAT_UH2_UH(tmp6, tmp7, 7); + PCKEV_AVG_ST_UB(tmp7, tmp6, dst3, dst + 16 + dst_stride); + + ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2); + ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + SAT_UH2_UH(tmp0, tmp1, 7); + PCKEV_AVG_ST_UB(tmp1, tmp0, dst4, dst + 32); + + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + SAT_UH2_UH(tmp2, tmp3, 7); + PCKEV_AVG_ST_UB(tmp3, tmp2, dst5, dst + 32 + dst_stride); + + ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6); + ILVL_B2_UB(src10, src9, 
src11, src10, vec5, vec7); + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5); + SRARI_H2_UH(tmp4, tmp5, FILTER_BITS); + SAT_UH2_UH(tmp4, tmp5, 7); + PCKEV_AVG_ST_UB(tmp5, tmp4, dst6, (dst + 48)); + + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7); + SRARI_H2_UH(tmp6, tmp7, FILTER_BITS); + SAT_UH2_UH(tmp6, tmp7, 7); + PCKEV_AVG_ST_UB(tmp7, tmp6, dst7, dst + 48 + dst_stride); + dst += (2 * dst_stride); + + src0 = src2; + src3 = src5; + src6 = src8; + src9 = src11; + } +} + +void vp9_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + int8_t cnt, filt_ver[8]; + + if (16 != y_step_q4) { + vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h); + return; + } + + if (((const int32_t *)filter_y)[1] == 0x800000) { + vp9_convolve_avg(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h); + return; + } + + for (cnt = 0; cnt < 8; ++cnt) { + filt_ver[cnt] = filter_y[cnt]; + } + + if (((const int32_t *)filter_y)[0] == 0) { + switch (w) { + case 4: + common_vt_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride, + dst, (int32_t)dst_stride, + &filt_ver[3], h); + break; + case 8: + common_vt_2t_and_aver_dst_8w_msa(src, (int32_t)src_stride, + dst, (int32_t)dst_stride, + &filt_ver[3], h); + break; + case 16: + common_vt_2t_and_aver_dst_16w_msa(src, (int32_t)src_stride, + dst, (int32_t)dst_stride, + &filt_ver[3], h); + break; + case 32: + common_vt_2t_and_aver_dst_32w_msa(src, (int32_t)src_stride, + dst, (int32_t)dst_stride, + &filt_ver[3], h); + break; + case 64: + common_vt_2t_and_aver_dst_64w_msa(src, (int32_t)src_stride, + dst, (int32_t)dst_stride, + &filt_ver[3], h); + break; + default: + vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h); + break; + } + } else { + switch (w) { + case 4: + common_vt_8t_and_aver_dst_4w_msa(src, (int32_t)src_stride, + dst, (int32_t)dst_stride, + filt_ver, h); + break; + case 8: + common_vt_8t_and_aver_dst_8w_msa(src, (int32_t)src_stride, + dst, (int32_t)dst_stride, + filt_ver, h); + break; + case 16: + common_vt_8t_and_aver_dst_16w_msa(src, (int32_t)src_stride, + dst, (int32_t)dst_stride, + filt_ver, h); + + break; + case 32: + common_vt_8t_and_aver_dst_32w_msa(src, (int32_t)src_stride, + dst, (int32_t)dst_stride, + filt_ver, h); + break; + case 64: + common_vt_8t_and_aver_dst_64w_msa(src, (int32_t)src_stride, + dst, (int32_t)dst_stride, + filt_ver, h); + break; + default: + vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h); + break; + } + } +} diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve8_horiz_msa.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve8_horiz_msa.c index e2247435e88..f175bf9b663 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve8_horiz_msa.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve8_horiz_msa.c @@ -14,37 +14,29 @@ static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { - v16i8 filt0, filt1, filt2, filt3; - v16i8 src0, src1, src2, src3; - v16u8 mask0, mask1, mask2, mask3; + v16u8 mask0, mask1, mask2, mask3, out; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; v8i16 filt, out0, 
out1; - mask0 = LOAD_UB(&mc_filt_mask_arr[16]); - + mask0 = LD_UB(&mc_filt_mask_arr[16]); src -= 3; /* rearranging filter */ - filt = LOAD_SH(filter); - filt0 = (v16i8)__msa_splati_h(filt, 0); - filt1 = (v16i8)__msa_splati_h(filt, 1); - filt2 = (v16i8)__msa_splati_h(filt, 2); - filt3 = (v16i8)__msa_splati_h(filt, 3); + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); mask1 = mask0 + 2; mask2 = mask0 + 4; mask3 = mask0 + 6; - LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3); - - XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128); - + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, filt0, filt1, filt2, filt3, out0, out1); - - out0 = SRARI_SATURATE_SIGNED_H(out0, FILTER_BITS, 7); - out1 = SRARI_SATURATE_SIGNED_H(out1, FILTER_BITS, 7); - - PCKEV_2B_XORI128_STORE_4_BYTES_4(out0, out1, dst, dst_stride); + SRARI_H2_SH(out0, out1, FILTER_BITS); + SAT_SH2_SH(out0, out1, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); } static void common_hz_8t_4x8_msa(const uint8_t *src, int32_t src_stride, @@ -52,47 +44,36 @@ static void common_hz_8t_4x8_msa(const uint8_t *src, int32_t src_stride, int8_t *filter) { v16i8 filt0, filt1, filt2, filt3; v16i8 src0, src1, src2, src3; - v16u8 mask0, mask1, mask2, mask3; + v16u8 mask0, mask1, mask2, mask3, out; v8i16 filt, out0, out1, out2, out3; - mask0 = LOAD_UB(&mc_filt_mask_arr[16]); - + mask0 = LD_UB(&mc_filt_mask_arr[16]); src -= 3; /* rearranging filter */ - filt = LOAD_SH(filter); - filt0 = (v16i8)__msa_splati_h(filt, 0); - filt1 = (v16i8)__msa_splati_h(filt, 1); - filt2 = (v16i8)__msa_splati_h(filt, 2); - filt3 = (v16i8)__msa_splati_h(filt, 3); + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); mask1 = mask0 + 2; mask2 = mask0 + 4; mask3 = mask0 + 6; - LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3); + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); src += (4 * src_stride); - - XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128); - HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, filt0, filt1, filt2, filt3, out0, out1); - - LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3); - - XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128); - + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, filt0, filt1, filt2, filt3, out2, out3); - - out0 = SRARI_SATURATE_SIGNED_H(out0, FILTER_BITS, 7); - out1 = SRARI_SATURATE_SIGNED_H(out1, FILTER_BITS, 7); - out2 = SRARI_SATURATE_SIGNED_H(out2, FILTER_BITS, 7); - out3 = SRARI_SATURATE_SIGNED_H(out3, FILTER_BITS, 7); - - PCKEV_2B_XORI128_STORE_4_BYTES_4(out0, out1, dst, dst_stride); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); dst += (4 * dst_stride); - PCKEV_2B_XORI128_STORE_4_BYTES_4(out2, out3, dst, dst_stride); + out = PCKEV_XORI128_UB(out2, out3); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); } static void common_hz_8t_4w_msa(const uint8_t *src, int32_t src_stride, @@ -108,82 +89,64 @@ static void common_hz_8t_4w_msa(const uint8_t *src, int32_t src_stride, static void common_hz_8t_8x4_msa(const 
uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { - v16i8 filt0, filt1, filt2, filt3; - v16i8 src0, src1, src2, src3; - v16u8 mask0, mask1, mask2, mask3; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1; v8i16 filt, out0, out1, out2, out3; - mask0 = LOAD_UB(&mc_filt_mask_arr[0]); - + mask0 = LD_UB(&mc_filt_mask_arr[0]); src -= 3; /* rearranging filter */ - filt = LOAD_SH(filter); - filt0 = (v16i8)__msa_splati_h(filt, 0); - filt1 = (v16i8)__msa_splati_h(filt, 1); - filt2 = (v16i8)__msa_splati_h(filt, 2); - filt3 = (v16i8)__msa_splati_h(filt, 3); + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); mask1 = mask0 + 2; mask2 = mask0 + 4; mask3 = mask0 + 6; - LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3); - - XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128); - + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, filt0, filt1, filt2, filt3, out0, out1, out2, out3); - - out0 = SRARI_SATURATE_SIGNED_H(out0, FILTER_BITS, 7); - out1 = SRARI_SATURATE_SIGNED_H(out1, FILTER_BITS, 7); - out2 = SRARI_SATURATE_SIGNED_H(out2, FILTER_BITS, 7); - out3 = SRARI_SATURATE_SIGNED_H(out3, FILTER_BITS, 7); - - PCKEV_B_4_XORI128_STORE_8_BYTES_4(out0, out1, out2, out3, dst, dst_stride); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + tmp0 = PCKEV_XORI128_UB(out0, out1); + tmp1 = PCKEV_XORI128_UB(out2, out3); + ST8x4_UB(tmp0, tmp1, dst, dst_stride); } static void common_hz_8t_8x8mult_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter, int32_t height) { uint32_t loop_cnt; - v16i8 filt0, filt1, filt2, filt3; - v16i8 src0, src1, src2, src3; - v16u8 mask0, mask1, mask2, mask3; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1; v8i16 filt, out0, out1, out2, out3; - mask0 = LOAD_UB(&mc_filt_mask_arr[0]); - + mask0 = LD_UB(&mc_filt_mask_arr[0]); src -= 3; /* rearranging filter */ - filt = LOAD_SH(filter); - filt0 = (v16i8)__msa_splati_h(filt, 0); - filt1 = (v16i8)__msa_splati_h(filt, 1); - filt2 = (v16i8)__msa_splati_h(filt, 2); - filt3 = (v16i8)__msa_splati_h(filt, 3); + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); mask1 = mask0 + 2; mask2 = mask0 + 4; mask3 = mask0 + 6; for (loop_cnt = (height >> 2); loop_cnt--;) { - LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3); + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); src += (4 * src_stride); - - XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128); - HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, filt0, filt1, filt2, filt3, out0, out1, out2, out3); - - out0 = SRARI_SATURATE_SIGNED_H(out0, FILTER_BITS, 7); - out1 = SRARI_SATURATE_SIGNED_H(out1, FILTER_BITS, 7); - out2 = SRARI_SATURATE_SIGNED_H(out2, FILTER_BITS, 7); - out3 = SRARI_SATURATE_SIGNED_H(out3, FILTER_BITS, 7); - - PCKEV_B_4_XORI128_STORE_8_BYTES_4(out0, out1, out2, out3, dst, dst_stride); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + tmp0 = PCKEV_XORI128_UB(out0, out1); + tmp1 = PCKEV_XORI128_UB(out2, out3); + ST8x4_UB(tmp0, tmp1, dst, dst_stride); dst += (4 * dst_stride); } } @@ -202,48 +165,36 @@ static void 
common_hz_8t_16w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter, int32_t height) { uint32_t loop_cnt; - v16i8 src0, src1, src2, src3; - v16i8 filt0, filt1, filt2, filt3; - v16u8 mask0, mask1, mask2, mask3; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 mask0, mask1, mask2, mask3, out; v8i16 filt, out0, out1, out2, out3; - mask0 = LOAD_UB(&mc_filt_mask_arr[0]); - + mask0 = LD_UB(&mc_filt_mask_arr[0]); src -= 3; /* rearranging filter */ - filt = LOAD_SH(filter); - filt0 = (v16i8)__msa_splati_h(filt, 0); - filt1 = (v16i8)__msa_splati_h(filt, 1); - filt2 = (v16i8)__msa_splati_h(filt, 2); - filt3 = (v16i8)__msa_splati_h(filt, 3); + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); mask1 = mask0 + 2; mask2 = mask0 + 4; mask3 = mask0 + 6; for (loop_cnt = (height >> 1); loop_cnt--;) { - src0 = LOAD_SB(src); - src1 = LOAD_SB(src + 8); - src += src_stride; - src2 = LOAD_SB(src); - src3 = LOAD_SB(src + 8); - src += src_stride; - - XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128); - + LD_SB2(src, src_stride, src0, src2); + LD_SB2(src + 8, src_stride, src1, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + src += (2 * src_stride); HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, filt0, filt1, filt2, filt3, out0, out1, out2, out3); - - out0 = SRARI_SATURATE_SIGNED_H(out0, FILTER_BITS, 7); - out1 = SRARI_SATURATE_SIGNED_H(out1, FILTER_BITS, 7); - out2 = SRARI_SATURATE_SIGNED_H(out2, FILTER_BITS, 7); - out3 = SRARI_SATURATE_SIGNED_H(out3, FILTER_BITS, 7); - - PCKEV_B_XORI128_STORE_VEC(out1, out0, dst); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST_UB(out, dst); dst += dst_stride; - PCKEV_B_XORI128_STORE_VEC(out3, out2, dst); + out = PCKEV_XORI128_UB(out2, out3); + ST_UB(out, dst); dst += dst_stride; } } @@ -252,68 +203,56 @@ static void common_hz_8t_32w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter, int32_t height) { uint32_t loop_cnt; - v16i8 src0, src1, src2, src3; - v16i8 filt0, filt1, filt2, filt3; - v16u8 mask0, mask1, mask2, mask3; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 mask0, mask1, mask2, mask3, out; v8i16 filt, out0, out1, out2, out3; - mask0 = LOAD_UB(&mc_filt_mask_arr[0]); - + mask0 = LD_UB(&mc_filt_mask_arr[0]); src -= 3; /* rearranging filter */ - filt = LOAD_SH(filter); - filt0 = (v16i8)__msa_splati_h(filt, 0); - filt1 = (v16i8)__msa_splati_h(filt, 1); - filt2 = (v16i8)__msa_splati_h(filt, 2); - filt3 = (v16i8)__msa_splati_h(filt, 3); + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); mask1 = mask0 + 2; mask2 = mask0 + 4; mask3 = mask0 + 6; for (loop_cnt = (height >> 1); loop_cnt--;) { - src0 = LOAD_SB(src); - src2 = LOAD_SB(src + 16); - src3 = LOAD_SB(src + 24); - src1 = __msa_sld_b((v16i8)src2, (v16i8)src0, 8); + src0 = LD_SB(src); + src2 = LD_SB(src + 16); + src3 = LD_SB(src + 24); + src1 = __msa_sldi_b(src2, src0, 8); src += src_stride; - - XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128); - + XORI_B4_128_SB(src0, src1, src2, src3); HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, filt0, filt1, filt2, filt3, out0, out1, out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); - out0 = SRARI_SATURATE_SIGNED_H(out0, FILTER_BITS, 7); - out1 = 
SRARI_SATURATE_SIGNED_H(out1, FILTER_BITS, 7); - out2 = SRARI_SATURATE_SIGNED_H(out2, FILTER_BITS, 7); - out3 = SRARI_SATURATE_SIGNED_H(out3, FILTER_BITS, 7); - - src0 = LOAD_SB(src); - src2 = LOAD_SB(src + 16); - src3 = LOAD_SB(src + 24); - src1 = __msa_sld_b((v16i8)src2, (v16i8)src0, 8); + src0 = LD_SB(src); + src2 = LD_SB(src + 16); + src3 = LD_SB(src + 24); + src1 = __msa_sldi_b(src2, src0, 8); + src += src_stride; - PCKEV_B_XORI128_STORE_VEC(out1, out0, dst); - PCKEV_B_XORI128_STORE_VEC(out3, out2, (dst + 16)); + out = PCKEV_XORI128_UB(out0, out1); + ST_UB(out, dst); + out = PCKEV_XORI128_UB(out2, out3); + ST_UB(out, dst + 16); dst += dst_stride; - XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128); - + XORI_B4_128_SB(src0, src1, src2, src3); HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, filt0, filt1, filt2, filt3, out0, out1, out2, out3); - - out0 = SRARI_SATURATE_SIGNED_H(out0, FILTER_BITS, 7); - out1 = SRARI_SATURATE_SIGNED_H(out1, FILTER_BITS, 7); - out2 = SRARI_SATURATE_SIGNED_H(out2, FILTER_BITS, 7); - out3 = SRARI_SATURATE_SIGNED_H(out3, FILTER_BITS, 7); - - PCKEV_B_XORI128_STORE_VEC(out1, out0, dst); - PCKEV_B_XORI128_STORE_VEC(out3, out2, (dst + 16)); - - src += src_stride; + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST_UB(out, dst); + out = PCKEV_XORI128_UB(out2, out3); + ST_UB(out, dst + 16); dst += dst_stride; } } @@ -321,50 +260,55 @@ static void common_hz_8t_32w_msa(const uint8_t *src, int32_t src_stride, static void common_hz_8t_64w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter, int32_t height) { - uint32_t loop_cnt, cnt; - v16i8 src0, src1, src2, src3; - v16i8 filt0, filt1, filt2, filt3; - v16u8 mask0, mask1, mask2, mask3; + int32_t loop_cnt; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 mask0, mask1, mask2, mask3, out; v8i16 filt, out0, out1, out2, out3; - mask0 = LOAD_UB(&mc_filt_mask_arr[0]); - + mask0 = LD_UB(&mc_filt_mask_arr[0]); src -= 3; /* rearranging filter */ - filt = LOAD_SH(filter); - filt0 = (v16i8)__msa_splati_h(filt, 0); - filt1 = (v16i8)__msa_splati_h(filt, 1); - filt2 = (v16i8)__msa_splati_h(filt, 2); - filt3 = (v16i8)__msa_splati_h(filt, 3); + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); mask1 = mask0 + 2; mask2 = mask0 + 4; mask3 = mask0 + 6; for (loop_cnt = height; loop_cnt--;) { - for (cnt = 0; cnt < 2; ++cnt) { - src0 = LOAD_SB(&src[cnt << 5]); - src2 = LOAD_SB(&src[16 + (cnt << 5)]); - src3 = LOAD_SB(&src[24 + (cnt << 5)]); - src1 = __msa_sld_b((v16i8)src2, (v16i8)src0, 8); - - XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128); - - HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, - mask3, filt0, filt1, filt2, filt3, out0, out1, - out2, out3); - - out0 = SRARI_SATURATE_SIGNED_H(out0, FILTER_BITS, 7); - out1 = SRARI_SATURATE_SIGNED_H(out1, FILTER_BITS, 7); - out2 = SRARI_SATURATE_SIGNED_H(out2, FILTER_BITS, 7); - out3 = SRARI_SATURATE_SIGNED_H(out3, FILTER_BITS, 7); - - PCKEV_B_XORI128_STORE_VEC(out1, out0, &dst[cnt << 5]); - PCKEV_B_XORI128_STORE_VEC(out3, out2, &dst[16 + (cnt << 5)]); - } + src0 = LD_SB(src); + src2 = LD_SB(src + 16); + src3 = LD_SB(src + 24); + src1 = __msa_sldi_b(src2, src0, 8); + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, out0, out1, + 
out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST_UB(out, dst); + out = PCKEV_XORI128_UB(out2, out3); + ST_UB(out, dst + 16); + + src0 = LD_SB(src + 32); + src2 = LD_SB(src + 48); + src3 = LD_SB(src + 56); + src1 = __msa_sldi_b(src2, src0, 8); src += src_stride; + + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, out0, out1, + out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST_UB(out, dst + 32); + out = PCKEV_XORI128_UB(out2, out3); + ST_UB(out, dst + 48); dst += dst_stride; } } @@ -372,124 +316,55 @@ static void common_hz_8t_64w_msa(const uint8_t *src, int32_t src_stride, static void common_hz_2t_4x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { - uint32_t out0, out1, out2, out3; v16i8 src0, src1, src2, src3, mask; - v16u8 vec0, vec1, filt0; - v16i8 res0, res1; + v16u8 filt0, vec0, vec1, res0, res1; v8u16 vec2, vec3, filt, const255; - mask = LOAD_SB(&mc_filt_mask_arr[16]); + mask = LD_SB(&mc_filt_mask_arr[16]); /* rearranging filter */ - filt = LOAD_UH(filter); - filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); - - const255 = (v8u16)__msa_ldi_h(255); - - LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3); - - vec0 = (v16u8)__msa_vshf_b(mask, src1, src0); - vec1 = (v16u8)__msa_vshf_b(mask, src3, src2); - - vec2 = __msa_dotp_u_h(vec0, filt0); - vec3 = __msa_dotp_u_h(vec1, filt0); - - vec2 = (v8u16)__msa_srari_h((v8i16)vec2, FILTER_BITS); - vec3 = (v8u16)__msa_srari_h((v8i16)vec3, FILTER_BITS); - - vec2 = __msa_min_u_h(vec2, const255); - vec3 = __msa_min_u_h(vec3, const255); - - res0 = __msa_pckev_b((v16i8)vec2, (v16i8)vec2); - res1 = __msa_pckev_b((v16i8)vec3, (v16i8)vec3); - - out0 = __msa_copy_u_w((v4i32)res0, 0); - out1 = __msa_copy_u_w((v4i32)res0, 1); - out2 = __msa_copy_u_w((v4i32)res1, 0); - out3 = __msa_copy_u_w((v4i32)res1, 1); - - STORE_WORD(dst, out0); - dst += dst_stride; - STORE_WORD(dst, out1); - dst += dst_stride; - STORE_WORD(dst, out2); - dst += dst_stride; - STORE_WORD(dst, out3); + filt = LD_UH(filter); + filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); + + const255 = (v8u16) __msa_ldi_h(255); + + LD_SB4(src, src_stride, src0, src1, src2, src3); + VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3); + SRARI_H2_UH(vec2, vec3, FILTER_BITS); + MIN_UH2_UH(vec2, vec3, const255); + PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1); + ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); } static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { - uint32_t out0, out1, out2, out3; - v16u8 filt0; + v16u8 vec0, vec1, vec2, vec3, filt0; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; - v16u8 vec0, vec1, vec2, vec3; - v8u16 vec4, vec5, vec6, vec7; v16i8 res0, res1, res2, res3; - v8u16 filt, const255; + v8u16 vec4, vec5, vec6, vec7, filt, const255; - mask = LOAD_SB(&mc_filt_mask_arr[16]); + mask = LD_SB(&mc_filt_mask_arr[16]); /* rearranging filter */ - filt = LOAD_UH(filter); - filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); - - const255 = (v8u16)__msa_ldi_h(255); - - LOAD_8VECS_SB(src, src_stride, - src0, src1, src2, src3, src4, src5, src6, src7); - - vec0 = (v16u8)__msa_vshf_b(mask, src1, src0); - vec1 = 
(v16u8)__msa_vshf_b(mask, src3, src2); - vec2 = (v16u8)__msa_vshf_b(mask, src5, src4); - vec3 = (v16u8)__msa_vshf_b(mask, src7, src6); - - vec4 = __msa_dotp_u_h(vec0, filt0); - vec5 = __msa_dotp_u_h(vec1, filt0); - vec6 = __msa_dotp_u_h(vec2, filt0); - vec7 = __msa_dotp_u_h(vec3, filt0); - - vec4 = (v8u16)__msa_srari_h((v8i16)vec4, FILTER_BITS); - vec5 = (v8u16)__msa_srari_h((v8i16)vec5, FILTER_BITS); - vec6 = (v8u16)__msa_srari_h((v8i16)vec6, FILTER_BITS); - vec7 = (v8u16)__msa_srari_h((v8i16)vec7, FILTER_BITS); - - vec4 = __msa_min_u_h(vec4, const255); - vec5 = __msa_min_u_h(vec5, const255); - vec6 = __msa_min_u_h(vec6, const255); - vec7 = __msa_min_u_h(vec7, const255); - - res0 = __msa_pckev_b((v16i8)vec4, (v16i8)vec4); - res1 = __msa_pckev_b((v16i8)vec5, (v16i8)vec5); - res2 = __msa_pckev_b((v16i8)vec6, (v16i8)vec6); - res3 = __msa_pckev_b((v16i8)vec7, (v16i8)vec7); - - out0 = __msa_copy_u_w((v4i32)res0, 0); - out1 = __msa_copy_u_w((v4i32)res0, 1); - out2 = __msa_copy_u_w((v4i32)res1, 0); - out3 = __msa_copy_u_w((v4i32)res1, 1); - - STORE_WORD(dst, out0); - dst += dst_stride; - STORE_WORD(dst, out1); - dst += dst_stride; - STORE_WORD(dst, out2); - dst += dst_stride; - STORE_WORD(dst, out3); - dst += dst_stride; - - out0 = __msa_copy_u_w((v4i32)res2, 0); - out1 = __msa_copy_u_w((v4i32)res2, 1); - out2 = __msa_copy_u_w((v4i32)res3, 0); - out3 = __msa_copy_u_w((v4i32)res3, 1); - - STORE_WORD(dst, out0); - dst += dst_stride; - STORE_WORD(dst, out1); - dst += dst_stride; - STORE_WORD(dst, out2); - dst += dst_stride; - STORE_WORD(dst, out3); + filt = LD_UH(filter); + filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); + + const255 = (v8u16) __msa_ldi_h(255); + + LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); + VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5, + vec6, vec7); + SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS); + MIN_UH4_UH(vec4, vec5, vec6, vec7, const255); + PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, + res2, res3); + ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); + dst += (4 * dst_stride); + ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); } static void common_hz_2t_4w_msa(const uint8_t *src, int32_t src_stride, @@ -507,149 +382,93 @@ static void common_hz_2t_8x4_msa(const uint8_t *src, int32_t src_stride, int8_t *filter) { v16u8 filt0; v16i8 src0, src1, src2, src3, mask; - v8u16 vec0, vec1, vec2, vec3; - v8u16 out0, out1, out2, out3; - v8u16 const255, filt; + v8u16 vec0, vec1, vec2, vec3, const255, filt; - mask = LOAD_SB(&mc_filt_mask_arr[0]); + mask = LD_SB(&mc_filt_mask_arr[0]); /* rearranging filter */ - filt = LOAD_UH(filter); - filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); - - const255 = (v8u16)__msa_ldi_h(255); - - LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3); - - vec0 = (v8u16)__msa_vshf_b(mask, src0, src0); - vec1 = (v8u16)__msa_vshf_b(mask, src1, src1); - vec2 = (v8u16)__msa_vshf_b(mask, src2, src2); - vec3 = (v8u16)__msa_vshf_b(mask, src3, src3); - - vec0 = __msa_dotp_u_h((v16u8)vec0, filt0); - vec1 = __msa_dotp_u_h((v16u8)vec1, filt0); - vec2 = __msa_dotp_u_h((v16u8)vec2, filt0); - vec3 = __msa_dotp_u_h((v16u8)vec3, filt0); - - SRARI_H_4VECS_UH(vec0, vec1, vec2, vec3, vec0, vec1, vec2, vec3, FILTER_BITS); - - out0 = __msa_min_u_h(vec0, const255); - out1 = __msa_min_u_h(vec1, const255); - out2 = __msa_min_u_h(vec2, const255); - out3 = 
__msa_min_u_h(vec3, const255); - - PCKEV_B_STORE_8_BYTES_4(out0, out1, out2, out3, dst, dst_stride); + filt = LD_UH(filter); + filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); + + const255 = (v8u16) __msa_ldi_h(255); + + LD_SB4(src, src_stride, src0, src1, src2, src3); + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + MIN_UH4_UH(vec0, vec1, vec2, vec3, const255); + PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1); + ST8x4_UB(src0, src1, dst, dst_stride); } static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter, int32_t height) { v16u8 filt0; - v16i8 src0, src1, src2, src3, mask; - v8u16 vec0, vec1, vec2, vec3; - v8u16 filt, const255; + v16i8 src0, src1, src2, src3, mask, out0, out1; + v8u16 vec0, vec1, vec2, vec3, filt, const255; - mask = LOAD_SB(&mc_filt_mask_arr[0]); + mask = LD_SB(&mc_filt_mask_arr[0]); /* rearranging filter */ - filt = LOAD_UH(filter); - filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + filt = LD_UH(filter); + filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); - const255 = (v8u16)__msa_ldi_h(255); + const255 = (v8u16) __msa_ldi_h(255); - LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3); + LD_SB4(src, src_stride, src0, src1, src2, src3); src += (4 * src_stride); - vec0 = (v8u16)__msa_vshf_b(mask, src0, src0); - vec1 = (v8u16)__msa_vshf_b(mask, src1, src1); - vec2 = (v8u16)__msa_vshf_b(mask, src2, src2); - vec3 = (v8u16)__msa_vshf_b(mask, src3, src3); - - vec0 = __msa_dotp_u_h((v16u8)vec0, filt0); - vec1 = __msa_dotp_u_h((v16u8)vec1, filt0); - vec2 = __msa_dotp_u_h((v16u8)vec2, filt0); - vec3 = __msa_dotp_u_h((v16u8)vec3, filt0); + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + MIN_UH4_UH(vec0, vec1, vec2, vec3, const255); - SRARI_H_4VECS_UH(vec0, vec1, vec2, vec3, vec0, vec1, vec2, vec3, FILTER_BITS); - - vec0 = __msa_min_u_h(vec0, const255); - vec1 = __msa_min_u_h(vec1, const255); - vec2 = __msa_min_u_h(vec2, const255); - vec3 = __msa_min_u_h(vec3, const255); - - LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3); + LD_SB4(src, src_stride, src0, src1, src2, src3); src += (4 * src_stride); - PCKEV_B_STORE_8_BYTES_4(vec0, vec1, vec2, vec3, dst, dst_stride); + PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); dst += (4 * dst_stride); - vec0 = (v8u16)__msa_vshf_b(mask, src0, src0); - vec1 = (v8u16)__msa_vshf_b(mask, src1, src1); - vec2 = (v8u16)__msa_vshf_b(mask, src2, src2); - vec3 = (v8u16)__msa_vshf_b(mask, src3, src3); - - vec0 = __msa_dotp_u_h((v16u8)vec0, filt0); - vec1 = __msa_dotp_u_h((v16u8)vec1, filt0); - vec2 = __msa_dotp_u_h((v16u8)vec2, filt0); - vec3 = __msa_dotp_u_h((v16u8)vec3, filt0); - - SRARI_H_4VECS_UH(vec0, vec1, vec2, vec3, vec0, vec1, vec2, vec3, FILTER_BITS); - - vec0 = __msa_min_u_h(vec0, const255); - vec1 = __msa_min_u_h(vec1, const255); - vec2 = __msa_min_u_h(vec2, const255); - vec3 = __msa_min_u_h(vec3, const255); - - PCKEV_B_STORE_8_BYTES_4(vec0, vec1, vec2, vec3, dst, dst_stride); + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, 
vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + MIN_UH4_UH(vec0, vec1, vec2, vec3, const255); + PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); dst += (4 * dst_stride); if (16 == height) { - LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3); + LD_SB4(src, src_stride, src0, src1, src2, src3); src += (4 * src_stride); - vec0 = (v8u16)__msa_vshf_b(mask, src0, src0); - vec1 = (v8u16)__msa_vshf_b(mask, src1, src1); - vec2 = (v8u16)__msa_vshf_b(mask, src2, src2); - vec3 = (v8u16)__msa_vshf_b(mask, src3, src3); - - vec0 = __msa_dotp_u_h((v16u8)vec0, filt0); - vec1 = __msa_dotp_u_h((v16u8)vec1, filt0); - vec2 = __msa_dotp_u_h((v16u8)vec2, filt0); - vec3 = __msa_dotp_u_h((v16u8)vec3, filt0); - - SRARI_H_4VECS_UH(vec0, vec1, vec2, vec3, - vec0, vec1, vec2, vec3, FILTER_BITS); - - vec0 = __msa_min_u_h(vec0, const255); - vec1 = __msa_min_u_h(vec1, const255); - vec2 = __msa_min_u_h(vec2, const255); - vec3 = __msa_min_u_h(vec3, const255); - - LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3); + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + MIN_UH4_UH(vec0, vec1, vec2, vec3, const255); + LD_SB4(src, src_stride, src0, src1, src2, src3); src += (4 * src_stride); - PCKEV_B_STORE_8_BYTES_4(vec0, vec1, vec2, vec3, dst, dst_stride); - dst += (4 * dst_stride); - - vec0 = (v8u16)__msa_vshf_b(mask, src0, src0); - vec1 = (v8u16)__msa_vshf_b(mask, src1, src1); - vec2 = (v8u16)__msa_vshf_b(mask, src2, src2); - vec3 = (v8u16)__msa_vshf_b(mask, src3, src3); - - vec0 = __msa_dotp_u_h((v16u8)vec0, filt0); - vec1 = __msa_dotp_u_h((v16u8)vec1, filt0); - vec2 = __msa_dotp_u_h((v16u8)vec2, filt0); - vec3 = __msa_dotp_u_h((v16u8)vec3, filt0); - - SRARI_H_4VECS_UH(vec0, vec1, vec2, vec3, - vec0, vec1, vec2, vec3, FILTER_BITS); - - vec0 = __msa_min_u_h(vec0, const255); - vec1 = __msa_min_u_h(vec1, const255); - vec2 = __msa_min_u_h(vec2, const255); - vec3 = __msa_min_u_h(vec3, const255); - - PCKEV_B_STORE_8_BYTES_4(vec0, vec1, vec2, vec3, dst, dst_stride); + PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); + + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + MIN_UH4_UH(vec0, vec1, vec2, vec3, const255); + PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); + ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride); } } @@ -668,136 +487,68 @@ static void common_hz_2t_16w_msa(const uint8_t *src, int32_t src_stride, int8_t *filter, int32_t height) { uint32_t loop_cnt; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; - v16u8 filt0; - v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v8u16 out0, out1, out2, out3, out4, out5, out6, out7; - v8u16 filt, const255; + v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt, const255; - mask = LOAD_SB(&mc_filt_mask_arr[0]); + mask = LD_SB(&mc_filt_mask_arr[0]); loop_cnt = (height >> 2) - 1; /* rearranging filter */ - filt = LOAD_UH(filter); - filt0 = 
(v16u8)__msa_splati_h((v8i16)filt, 0); - - const255 = (v8u16)__msa_ldi_h(255); - - src0 = LOAD_SB(src); - src1 = LOAD_SB(src + 8); - src += src_stride; - src2 = LOAD_SB(src); - src3 = LOAD_SB(src + 8); - src += src_stride; - src4 = LOAD_SB(src); - src5 = LOAD_SB(src + 8); - src += src_stride; - src6 = LOAD_SB(src); - src7 = LOAD_SB(src + 8); - src += src_stride; - - vec0 = (v16u8)__msa_vshf_b(mask, src0, src0); - vec1 = (v16u8)__msa_vshf_b(mask, src1, src1); - vec2 = (v16u8)__msa_vshf_b(mask, src2, src2); - vec3 = (v16u8)__msa_vshf_b(mask, src3, src3); - vec4 = (v16u8)__msa_vshf_b(mask, src4, src4); - vec5 = (v16u8)__msa_vshf_b(mask, src5, src5); - vec6 = (v16u8)__msa_vshf_b(mask, src6, src6); - vec7 = (v16u8)__msa_vshf_b(mask, src7, src7); - - out0 = __msa_dotp_u_h(vec0, filt0); - out1 = __msa_dotp_u_h(vec1, filt0); - out2 = __msa_dotp_u_h(vec2, filt0); - out3 = __msa_dotp_u_h(vec3, filt0); - out4 = __msa_dotp_u_h(vec4, filt0); - out5 = __msa_dotp_u_h(vec5, filt0); - out6 = __msa_dotp_u_h(vec6, filt0); - out7 = __msa_dotp_u_h(vec7, filt0); - - out0 = (v8u16)__msa_srari_h((v8i16)out0, FILTER_BITS); - out1 = (v8u16)__msa_srari_h((v8i16)out1, FILTER_BITS); - out2 = (v8u16)__msa_srari_h((v8i16)out2, FILTER_BITS); - out3 = (v8u16)__msa_srari_h((v8i16)out3, FILTER_BITS); - out4 = (v8u16)__msa_srari_h((v8i16)out4, FILTER_BITS); - out5 = (v8u16)__msa_srari_h((v8i16)out5, FILTER_BITS); - out6 = (v8u16)__msa_srari_h((v8i16)out6, FILTER_BITS); - out7 = (v8u16)__msa_srari_h((v8i16)out7, FILTER_BITS); - - out0 = __msa_min_u_h(out0, const255); - out1 = __msa_min_u_h(out1, const255); - out2 = __msa_min_u_h(out2, const255); - out3 = __msa_min_u_h(out3, const255); - out4 = __msa_min_u_h(out4, const255); - out5 = __msa_min_u_h(out5, const255); - out6 = __msa_min_u_h(out6, const255); - out7 = __msa_min_u_h(out7, const255); - - PCKEV_B_STORE_VEC(out1, out0, dst); + filt = LD_UH(filter); + filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); + + const255 = (v8u16) __msa_ldi_h(255); + + LD_SB4(src, src_stride, src0, src2, src4, src6); + LD_SB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + + VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); + VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, + out2, out3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, + out6, out7); + SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); + SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); + MIN_UH4_UH(out0, out1, out2, out3, const255); + MIN_UH4_UH(out4, out5, out6, out7, const255); + PCKEV_ST_SB(out0, out1, dst); dst += dst_stride; - PCKEV_B_STORE_VEC(out3, out2, dst); + PCKEV_ST_SB(out2, out3, dst); dst += dst_stride; - PCKEV_B_STORE_VEC(out5, out4, dst); + PCKEV_ST_SB(out4, out5, dst); dst += dst_stride; - PCKEV_B_STORE_VEC(out7, out6, dst); + PCKEV_ST_SB(out6, out7, dst); dst += dst_stride; for (; loop_cnt--;) { - src0 = LOAD_SB(src); - src1 = LOAD_SB(src + 8); - src += src_stride; - src2 = LOAD_SB(src); - src3 = LOAD_SB(src + 8); - src += src_stride; - src4 = LOAD_SB(src); - src5 = LOAD_SB(src + 8); - src += src_stride; - src6 = LOAD_SB(src); - src7 = LOAD_SB(src + 8); - src += src_stride; + LD_SB4(src, src_stride, src0, src2, src4, src6); + LD_SB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); - vec0 = 
(v16u8)__msa_vshf_b(mask, src0, src0); - vec1 = (v16u8)__msa_vshf_b(mask, src1, src1); - vec2 = (v16u8)__msa_vshf_b(mask, src2, src2); - vec3 = (v16u8)__msa_vshf_b(mask, src3, src3); - vec4 = (v16u8)__msa_vshf_b(mask, src4, src4); - vec5 = (v16u8)__msa_vshf_b(mask, src5, src5); - vec6 = (v16u8)__msa_vshf_b(mask, src6, src6); - vec7 = (v16u8)__msa_vshf_b(mask, src7, src7); - - out0 = __msa_dotp_u_h(vec0, filt0); - out1 = __msa_dotp_u_h(vec1, filt0); - out2 = __msa_dotp_u_h(vec2, filt0); - out3 = __msa_dotp_u_h(vec3, filt0); - out4 = __msa_dotp_u_h(vec4, filt0); - out5 = __msa_dotp_u_h(vec5, filt0); - out6 = __msa_dotp_u_h(vec6, filt0); - out7 = __msa_dotp_u_h(vec7, filt0); - - out0 = (v8u16)__msa_srari_h((v8i16)out0, FILTER_BITS); - out1 = (v8u16)__msa_srari_h((v8i16)out1, FILTER_BITS); - out2 = (v8u16)__msa_srari_h((v8i16)out2, FILTER_BITS); - out3 = (v8u16)__msa_srari_h((v8i16)out3, FILTER_BITS); - out4 = (v8u16)__msa_srari_h((v8i16)out4, FILTER_BITS); - out5 = (v8u16)__msa_srari_h((v8i16)out5, FILTER_BITS); - out6 = (v8u16)__msa_srari_h((v8i16)out6, FILTER_BITS); - out7 = (v8u16)__msa_srari_h((v8i16)out7, FILTER_BITS); - - out0 = __msa_min_u_h(out0, const255); - out1 = __msa_min_u_h(out1, const255); - out2 = __msa_min_u_h(out2, const255); - out3 = __msa_min_u_h(out3, const255); - out4 = __msa_min_u_h(out4, const255); - out5 = __msa_min_u_h(out5, const255); - out6 = __msa_min_u_h(out6, const255); - out7 = __msa_min_u_h(out7, const255); - - PCKEV_B_STORE_VEC(out1, out0, dst); + VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); + VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, + out2, out3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, + out6, out7); + SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); + SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); + MIN_UH4_UH(out0, out1, out2, out3, const255); + MIN_UH4_UH(out4, out5, out6, out7, const255); + PCKEV_ST_SB(out0, out1, dst); dst += dst_stride; - PCKEV_B_STORE_VEC(out3, out2, dst); + PCKEV_ST_SB(out2, out3, dst); dst += dst_stride; - PCKEV_B_STORE_VEC(out5, out4, dst); + PCKEV_ST_SB(out4, out5, dst); dst += dst_stride; - PCKEV_B_STORE_VEC(out7, out6, dst); + PCKEV_ST_SB(out6, out7, dst); dst += dst_stride; } } @@ -807,72 +558,46 @@ static void common_hz_2t_32w_msa(const uint8_t *src, int32_t src_stride, int8_t *filter, int32_t height) { uint32_t loop_cnt; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; - v16u8 filt0; - v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v8u16 out0, out1, out2, out3, out4, out5, out6, out7; - v8u16 filt, const255; + v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt, const255; - mask = LOAD_SB(&mc_filt_mask_arr[0]); + mask = LD_SB(&mc_filt_mask_arr[0]); /* rearranging filter */ - filt = LOAD_UH(filter); - filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + filt = LD_UH(filter); + filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); - const255 = (v8u16)__msa_ldi_h(255); + const255 = (v8u16) __msa_ldi_h(255); for (loop_cnt = height >> 1; loop_cnt--;) { - src0 = LOAD_SB(src); - src2 = LOAD_SB(src + 16); - src3 = LOAD_SB(src + 24); - src1 = __msa_sld_b(src2, src0, 8); + src0 = LD_SB(src); + src2 = LD_SB(src + 16); + src3 = LD_SB(src + 24); + src1 = __msa_sldi_b(src2, src0, 8); 
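/*
 * A scalar sketch of one row of the 2-tap horizontal filter used by the
 * wide-block kernels in this hunk: __msa_sldi_b splices bytes 8..23 of the
 * row out of two adjacent 16-byte loads, and the VSHF/DOTP/SRARI/MIN/PCKEV
 * sequence then blends each pair of horizontally adjacent pixels.  Per pixel
 * this is bilinear interpolation rounded by FILTER_BITS and clamped to 255
 * (no dst averaging on this path).  The helper is illustrative only and
 * assumes FILTER_BITS == 7; it is not part of libvpx or of this change.
 */
#include <stdint.h>

static void hz_2t_row_sketch(const uint8_t *src, uint8_t *dst, int width,
                             uint8_t f0, uint8_t f1) {
  int x;
  for (x = 0; x < width; ++x) {
    int sum = src[x] * f0 + src[x + 1] * f1;    /* DOTP_UB*_UH          */
    int out = (sum + (1 << 6)) >> 7;            /* SRARI_H*_UH, 7 bits  */
    dst[x] = (uint8_t)(out > 255 ? 255 : out);  /* MIN_UH*_UH, const255 */
  }
}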
src += src_stride; - src4 = LOAD_SB(src); - src6 = LOAD_SB(src + 16); - src7 = LOAD_SB(src + 24); - src5 = __msa_sld_b(src6, src4, 8); + src4 = LD_SB(src); + src6 = LD_SB(src + 16); + src7 = LD_SB(src + 24); + src5 = __msa_sldi_b(src6, src4, 8); src += src_stride; - vec0 = (v16u8)__msa_vshf_b(mask, src0, src0); - vec1 = (v16u8)__msa_vshf_b(mask, src1, src1); - vec2 = (v16u8)__msa_vshf_b(mask, src2, src2); - vec3 = (v16u8)__msa_vshf_b(mask, src3, src3); - vec4 = (v16u8)__msa_vshf_b(mask, src4, src4); - vec5 = (v16u8)__msa_vshf_b(mask, src5, src5); - vec6 = (v16u8)__msa_vshf_b(mask, src6, src6); - vec7 = (v16u8)__msa_vshf_b(mask, src7, src7); - - out0 = __msa_dotp_u_h(vec0, filt0); - out1 = __msa_dotp_u_h(vec1, filt0); - out2 = __msa_dotp_u_h(vec2, filt0); - out3 = __msa_dotp_u_h(vec3, filt0); - out4 = __msa_dotp_u_h(vec4, filt0); - out5 = __msa_dotp_u_h(vec5, filt0); - out6 = __msa_dotp_u_h(vec6, filt0); - out7 = __msa_dotp_u_h(vec7, filt0); - - out0 = (v8u16)__msa_srari_h((v8i16)out0, FILTER_BITS); - out1 = (v8u16)__msa_srari_h((v8i16)out1, FILTER_BITS); - out2 = (v8u16)__msa_srari_h((v8i16)out2, FILTER_BITS); - out3 = (v8u16)__msa_srari_h((v8i16)out3, FILTER_BITS); - out4 = (v8u16)__msa_srari_h((v8i16)out4, FILTER_BITS); - out5 = (v8u16)__msa_srari_h((v8i16)out5, FILTER_BITS); - out6 = (v8u16)__msa_srari_h((v8i16)out6, FILTER_BITS); - out7 = (v8u16)__msa_srari_h((v8i16)out7, FILTER_BITS); - - out0 = __msa_min_u_h(out0, const255); - out1 = __msa_min_u_h(out1, const255); - out2 = __msa_min_u_h(out2, const255); - out3 = __msa_min_u_h(out3, const255); - out4 = __msa_min_u_h(out4, const255); - out5 = __msa_min_u_h(out5, const255); - out6 = __msa_min_u_h(out6, const255); - out7 = __msa_min_u_h(out7, const255); - - PCKEV_B_STORE_VEC(out1, out0, dst); - PCKEV_B_STORE_VEC(out3, out2, dst + 16); + VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); + VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, + out2, out3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, + out6, out7); + SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); + SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); + MIN_UH4_UH(out0, out1, out2, out3, const255); + MIN_UH4_UH(out4, out5, out6, out7, const255); + PCKEV_ST_SB(out0, out1, dst); + PCKEV_ST_SB(out2, out3, dst + 16); dst += dst_stride; - PCKEV_B_STORE_VEC(out5, out4, dst); - PCKEV_B_STORE_VEC(out7, out6, dst + 16); + PCKEV_ST_SB(out4, out5, dst); + PCKEV_ST_SB(out6, out7, dst + 16); dst += dst_stride; } } @@ -882,70 +607,42 @@ static void common_hz_2t_64w_msa(const uint8_t *src, int32_t src_stride, int8_t *filter, int32_t height) { uint32_t loop_cnt; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; - v16u8 filt0; - v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v8u16 out0, out1, out2, out3, out4, out5, out6, out7; - v8u16 filt, const255; + v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt, const255; - mask = LOAD_SB(&mc_filt_mask_arr[0]); + mask = LD_SB(&mc_filt_mask_arr[0]); /* rearranging filter */ - filt = LOAD_UH(filter); - filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + filt = LD_UH(filter); + filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); - const255 = (v8u16)__msa_ldi_h(255); + const255 = (v8u16) __msa_ldi_h(255); for (loop_cnt = 
height; loop_cnt--;) { - src0 = LOAD_SB(src); - src2 = LOAD_SB(src + 16); - src4 = LOAD_SB(src + 32); - src6 = LOAD_SB(src + 48); - src7 = LOAD_SB(src + 56); - src1 = __msa_sld_b(src2, src0, 8); - src3 = __msa_sld_b(src4, src2, 8); - src5 = __msa_sld_b(src6, src4, 8); + src0 = LD_SB(src); + src2 = LD_SB(src + 16); + src4 = LD_SB(src + 32); + src6 = LD_SB(src + 48); + src7 = LD_SB(src + 56); + SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8); src += src_stride; - vec0 = (v16u8)__msa_vshf_b(mask, src0, src0); - vec1 = (v16u8)__msa_vshf_b(mask, src1, src1); - vec2 = (v16u8)__msa_vshf_b(mask, src2, src2); - vec3 = (v16u8)__msa_vshf_b(mask, src3, src3); - vec4 = (v16u8)__msa_vshf_b(mask, src4, src4); - vec5 = (v16u8)__msa_vshf_b(mask, src5, src5); - vec6 = (v16u8)__msa_vshf_b(mask, src6, src6); - vec7 = (v16u8)__msa_vshf_b(mask, src7, src7); - - out0 = __msa_dotp_u_h(vec0, filt0); - out1 = __msa_dotp_u_h(vec1, filt0); - out2 = __msa_dotp_u_h(vec2, filt0); - out3 = __msa_dotp_u_h(vec3, filt0); - out4 = __msa_dotp_u_h(vec4, filt0); - out5 = __msa_dotp_u_h(vec5, filt0); - out6 = __msa_dotp_u_h(vec6, filt0); - out7 = __msa_dotp_u_h(vec7, filt0); - - out0 = (v8u16)__msa_srari_h((v8i16)out0, FILTER_BITS); - out1 = (v8u16)__msa_srari_h((v8i16)out1, FILTER_BITS); - out2 = (v8u16)__msa_srari_h((v8i16)out2, FILTER_BITS); - out3 = (v8u16)__msa_srari_h((v8i16)out3, FILTER_BITS); - out4 = (v8u16)__msa_srari_h((v8i16)out4, FILTER_BITS); - out5 = (v8u16)__msa_srari_h((v8i16)out5, FILTER_BITS); - out6 = (v8u16)__msa_srari_h((v8i16)out6, FILTER_BITS); - out7 = (v8u16)__msa_srari_h((v8i16)out7, FILTER_BITS); - - out0 = __msa_min_u_h(out0, const255); - out1 = __msa_min_u_h(out1, const255); - out2 = __msa_min_u_h(out2, const255); - out3 = __msa_min_u_h(out3, const255); - out4 = __msa_min_u_h(out4, const255); - out5 = __msa_min_u_h(out5, const255); - out6 = __msa_min_u_h(out6, const255); - out7 = __msa_min_u_h(out7, const255); - - PCKEV_B_STORE_VEC(out1, out0, dst); - PCKEV_B_STORE_VEC(out3, out2, dst + 16); - PCKEV_B_STORE_VEC(out5, out4, dst + 32); - PCKEV_B_STORE_VEC(out7, out6, dst + 48); + VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); + VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, + out2, out3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, + out6, out7); + SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); + SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); + MIN_UH4_UH(out0, out1, out2, out3, const255); + MIN_UH4_UH(out4, out5, out6, out7, const255); + PCKEV_ST_SB(out0, out1, dst); + PCKEV_ST_SB(out2, out3, dst + 16); + PCKEV_ST_SB(out4, out5, dst + 32); + PCKEV_ST_SB(out6, out7, dst + 48); dst += dst_stride; } } diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve8_msa.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve8_msa.c index d0c374648c9..b1279d97ce5 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve8_msa.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve8_msa.c @@ -26,93 +26,68 @@ static void common_hv_8ht_8vt_4w_msa(const uint8_t *src, int32_t src_stride, int32_t height) { uint32_t loop_cnt; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; - v16i8 filt_horiz0, 
filt_horiz1, filt_horiz2, filt_horiz3; - v16u8 mask0, mask1, mask2, mask3; - v8i16 filt_horiz; - v8i16 horiz_out0, horiz_out1, horiz_out2, horiz_out3, horiz_out4; - v8i16 horiz_out5, horiz_out6, horiz_out7, horiz_out8, horiz_out9; - v8i16 tmp0, tmp1, out0, out1, out2, out3, out4; - v8i16 filt, filt_vert0, filt_vert1, filt_vert2, filt_vert3; - - mask0 = LOAD_UB(&mc_filt_mask_arr[16]); + v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3; + v16u8 mask0, mask1, mask2, mask3, out; + v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; + v8i16 hz_out7, hz_out8, hz_out9, tmp0, tmp1, out0, out1, out2, out3, out4; + v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3; + mask0 = LD_UB(&mc_filt_mask_arr[16]); src -= (3 + 3 * src_stride); /* rearranging filter */ - filt_horiz = LOAD_SH(filter_horiz); - filt_horiz0 = (v16i8)__msa_splati_h(filt_horiz, 0); - filt_horiz1 = (v16i8)__msa_splati_h(filt_horiz, 1); - filt_horiz2 = (v16i8)__msa_splati_h(filt_horiz, 2); - filt_horiz3 = (v16i8)__msa_splati_h(filt_horiz, 3); + filt = LD_SH(filter_horiz); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); mask1 = mask0 + 2; mask2 = mask0 + 4; mask3 = mask0 + 6; - LOAD_7VECS_SB(src, src_stride, src0, src1, src2, src3, src4, src5, src6); + LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); src += (7 * src_stride); - XORI_B_7VECS_SB(src0, src1, src2, src3, src4, src5, src6, - src0, src1, src2, src3, src4, src5, src6, 128); - - horiz_out0 = HORIZ_8TAP_FILT_2VECS(src0, src1, mask0, mask1, mask2, mask3, - filt_horiz0, filt_horiz1, filt_horiz2, - filt_horiz3); - horiz_out2 = HORIZ_8TAP_FILT_2VECS(src2, src3, mask0, mask1, mask2, mask3, - filt_horiz0, filt_horiz1, filt_horiz2, - filt_horiz3); - horiz_out4 = HORIZ_8TAP_FILT_2VECS(src4, src5, mask0, mask1, mask2, mask3, - filt_horiz0, filt_horiz1, filt_horiz2, - filt_horiz3); - horiz_out5 = HORIZ_8TAP_FILT_2VECS(src5, src6, mask0, mask1, mask2, mask3, - filt_horiz0, filt_horiz1, filt_horiz2, - filt_horiz3); - horiz_out1 = (v8i16)__msa_sldi_b((v16i8)horiz_out2, (v16i8)horiz_out0, 8); - horiz_out3 = (v8i16)__msa_sldi_b((v16i8)horiz_out4, (v16i8)horiz_out2, 8); - - filt = LOAD_SH(filter_vert); - filt_vert0 = __msa_splati_h(filt, 0); - filt_vert1 = __msa_splati_h(filt, 1); - filt_vert2 = __msa_splati_h(filt, 2); - filt_vert3 = __msa_splati_h(filt, 3); - - out0 = (v8i16)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0); - out1 = (v8i16)__msa_ilvev_b((v16i8)horiz_out3, (v16i8)horiz_out2); - out2 = (v8i16)__msa_ilvev_b((v16i8)horiz_out5, (v16i8)horiz_out4); - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LOAD_4VECS_SB(src, src_stride, src7, src8, src9, src10); - src += (4 * src_stride); - - XORI_B_4VECS_SB(src7, src8, src9, src10, src7, src8, src9, src10, 128); + hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8); - horiz_out7 = HORIZ_8TAP_FILT_2VECS(src7, src8, mask0, mask1, mask2, mask3, - filt_horiz0, filt_horiz1, filt_horiz2, - filt_horiz3); - horiz_out6 = (v8i16)__msa_sldi_b((v16i8)horiz_out7, (v16i8)horiz_out5, 
8); + filt = LD_SH(filter_vert); + SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3); - out3 = (v8i16)__msa_ilvev_b((v16i8)horiz_out7, (v16i8)horiz_out6); + ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1); + out2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4); - tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vert0, filt_vert1, - filt_vert2, filt_vert3); - - horiz_out9 = HORIZ_8TAP_FILT_2VECS(src9, src10, mask0, mask1, mask2, mask3, - filt_horiz0, filt_horiz1, filt_horiz2, - filt_horiz3); - horiz_out8 = (v8i16)__msa_sldi_b((v16i8)horiz_out9, (v16i8)horiz_out7, 8); - - out4 = (v8i16)__msa_ilvev_b((v16i8)horiz_out9, (v16i8)horiz_out8); - - tmp1 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out4, filt_vert0, filt_vert1, - filt_vert2, filt_vert3); - tmp0 = SRARI_SATURATE_SIGNED_H(tmp0, FILTER_BITS, 7); - tmp1 = SRARI_SATURATE_SIGNED_H(tmp1, FILTER_BITS, 7); + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src7, src8, src9, src10); + XORI_B4_128_SB(src7, src8, src9, src10); + src += (4 * src_stride); - PCKEV_2B_XORI128_STORE_4_BYTES_4(tmp0, tmp1, dst, dst_stride); + hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, + filt_hz0, filt_hz1, filt_hz2, filt_hz3); + hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8); + out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6); + tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + + hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3, + filt_hz0, filt_hz1, filt_hz2, filt_hz3); + hz_out8 = (v8i16)__msa_sldi_b((v16i8)hz_out9, (v16i8)hz_out7, 8); + out4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8); + tmp1 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out4, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + SRARI_H2_SH(tmp0, tmp1, FILTER_BITS); + SAT_SH2_SH(tmp0, tmp1, 7); + out = PCKEV_XORI128_UB(tmp0, tmp1); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); dst += (4 * dst_stride); - horiz_out5 = horiz_out9; - + hz_out5 = hz_out9; out0 = out2; out1 = out3; out2 = out4; @@ -125,108 +100,87 @@ static void common_hv_8ht_8vt_8w_msa(const uint8_t *src, int32_t src_stride, int32_t height) { uint32_t loop_cnt; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; - v16i8 filt_horiz0, filt_horiz1, filt_horiz2, filt_horiz3; - v8i16 filt_horiz, filt, filt_vert0, filt_vert1, filt_vert2, filt_vert3; - v16u8 mask0, mask1, mask2, mask3; - v8i16 horiz_out0, horiz_out1, horiz_out2, horiz_out3; - v8i16 horiz_out4, horiz_out5, horiz_out6, horiz_out7; - v8i16 horiz_out8, horiz_out9, horiz_out10; + v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3; + v16u8 mask0, mask1, mask2, mask3, vec0, vec1; + v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3; + v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; + v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3; v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9; - v8i16 tmp0, tmp1, tmp2, tmp3; - - mask0 = LOAD_UB(&mc_filt_mask_arr[0]); + mask0 = LD_UB(&mc_filt_mask_arr[0]); src -= (3 + 3 * src_stride); /* rearranging filter */ - filt_horiz = LOAD_SH(filter_horiz); - filt_horiz0 = (v16i8)__msa_splati_h(filt_horiz, 0); - filt_horiz1 = (v16i8)__msa_splati_h(filt_horiz, 1); - filt_horiz2 = (v16i8)__msa_splati_h(filt_horiz, 2); - filt_horiz3 = (v16i8)__msa_splati_h(filt_horiz, 3); + filt = LD_SH(filter_horiz); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); mask1 = mask0 + 2; mask2 = mask0 + 4; 
mask3 = mask0 + 6; - LOAD_7VECS_SB(src, src_stride, src0, src1, src2, src3, src4, src5, src6); + LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); src += (7 * src_stride); - XORI_B_7VECS_SB(src0, src1, src2, src3, src4, src5, src6, - src0, src1, src2, src3, src4, src5, src6, 128); - - horiz_out0 = HORIZ_8TAP_FILT(src0, mask0, mask1, mask2, mask3, filt_horiz0, - filt_horiz1, filt_horiz2, filt_horiz3); - horiz_out1 = HORIZ_8TAP_FILT(src1, mask0, mask1, mask2, mask3, filt_horiz0, - filt_horiz1, filt_horiz2, filt_horiz3); - horiz_out2 = HORIZ_8TAP_FILT(src2, mask0, mask1, mask2, mask3, filt_horiz0, - filt_horiz1, filt_horiz2, filt_horiz3); - horiz_out3 = HORIZ_8TAP_FILT(src3, mask0, mask1, mask2, mask3, filt_horiz0, - filt_horiz1, filt_horiz2, filt_horiz3); - horiz_out4 = HORIZ_8TAP_FILT(src4, mask0, mask1, mask2, mask3, filt_horiz0, - filt_horiz1, filt_horiz2, filt_horiz3); - horiz_out5 = HORIZ_8TAP_FILT(src5, mask0, mask1, mask2, mask3, filt_horiz0, - filt_horiz1, filt_horiz2, filt_horiz3); - horiz_out6 = HORIZ_8TAP_FILT(src6, mask0, mask1, mask2, mask3, filt_horiz0, - filt_horiz1, filt_horiz2, filt_horiz3); - - filt = LOAD_SH(filter_vert); - filt_vert0 = __msa_splati_h(filt, 0); - filt_vert1 = __msa_splati_h(filt, 1); - filt_vert2 = __msa_splati_h(filt, 2); - filt_vert3 = __msa_splati_h(filt, 3); - - out0 = (v8i16)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0); - out1 = (v8i16)__msa_ilvev_b((v16i8)horiz_out3, (v16i8)horiz_out2); - out2 = (v8i16)__msa_ilvev_b((v16i8)horiz_out5, (v16i8)horiz_out4); - out4 = (v8i16)__msa_ilvev_b((v16i8)horiz_out2, (v16i8)horiz_out1); - out5 = (v8i16)__msa_ilvev_b((v16i8)horiz_out4, (v16i8)horiz_out3); - out6 = (v8i16)__msa_ilvev_b((v16i8)horiz_out6, (v16i8)horiz_out5); + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + + filt = LD_SH(filter_vert); + SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3); + + ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1); + ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4); + ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6); for (loop_cnt = (height >> 2); loop_cnt--;) { - LOAD_4VECS_SB(src, src_stride, src7, src8, src9, src10); + LD_SB4(src, src_stride, src7, src8, src9, src10); src += (4 * src_stride); - XORI_B_4VECS_SB(src7, src8, src9, src10, src7, src8, src9, src10, 128); - - horiz_out7 = HORIZ_8TAP_FILT(src7, mask0, mask1, mask2, mask3, filt_horiz0, - filt_horiz1, filt_horiz2, filt_horiz3); - - out3 = (v8i16)__msa_ilvev_b((v16i8)horiz_out7, (v16i8)horiz_out6); - tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vert0, filt_vert1, - filt_vert2, filt_vert3); - tmp0 = SRARI_SATURATE_SIGNED_H(tmp0, FILTER_BITS, 7); - - horiz_out8 = HORIZ_8TAP_FILT(src8, mask0, mask1, mask2, 
mask3, filt_horiz0, - filt_horiz1, filt_horiz2, filt_horiz3); - - out7 = (v8i16)__msa_ilvev_b((v16i8)horiz_out8, (v16i8)horiz_out7); - tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vert0, filt_vert1, - filt_vert2, filt_vert3); - tmp1 = SRARI_SATURATE_SIGNED_H(tmp1, FILTER_BITS, 7); - - horiz_out9 = HORIZ_8TAP_FILT(src9, mask0, mask1, mask2, mask3, filt_horiz0, - filt_horiz1, filt_horiz2, filt_horiz3); - - out8 = (v8i16)__msa_ilvev_b((v16i8)horiz_out9, (v16i8)horiz_out8); - tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vert0, filt_vert1, - filt_vert2, filt_vert3); - tmp2 = SRARI_SATURATE_SIGNED_H(tmp2, FILTER_BITS, 7); - - horiz_out10 = HORIZ_8TAP_FILT(src10, mask0, mask1, mask2, mask3, - filt_horiz0, filt_horiz1, filt_horiz2, - filt_horiz3); - - out9 = (v8i16)__msa_ilvev_b((v16i8)horiz_out10, (v16i8)horiz_out9); - tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vert0, filt_vert1, - filt_vert2, filt_vert3); - tmp3 = SRARI_SATURATE_SIGNED_H(tmp3, FILTER_BITS, 7); - - PCKEV_B_4_XORI128_STORE_8_BYTES_4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride); + XORI_B4_128_SB(src7, src8, src9, src10); + + hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, + filt_hz0, filt_hz1, filt_hz2, filt_hz3); + out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6); + tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + + hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, + filt_hz0, filt_hz1, filt_hz2, filt_hz3); + out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7); + tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + + hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, + filt_hz0, filt_hz1, filt_hz2, filt_hz3); + out8 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8); + tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + + hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3, + filt_hz0, filt_hz1, filt_hz2, filt_hz3); + out9 = (v8i16)__msa_ilvev_b((v16i8)hz_out10, (v16i8)hz_out9); + tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); + vec0 = PCKEV_XORI128_UB(tmp0, tmp1); + vec1 = PCKEV_XORI128_UB(tmp2, tmp3); + ST8x4_UB(vec0, vec1, dst, dst_stride); dst += (4 * dst_stride); - horiz_out6 = horiz_out10; - + hz_out6 = hz_out10; out0 = out2; out1 = out3; out2 = out8; @@ -279,175 +233,89 @@ static void common_hv_2ht_2vt_4x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter_horiz, int8_t *filter_vert) { - uint32_t out0, out1, out2, out3; v16i8 src0, src1, src2, src3, src4, mask; - v16u8 res0, res1, horiz_vec; - v16u8 filt_vert, filt_horiz, vec0, vec1; - v8u16 filt, tmp0, tmp1; - v8u16 horiz_out0, horiz_out1, horiz_out2, horiz_out3, horiz_out4; + v16u8 filt_vt, filt_hz, vec0, vec1, res0, res1; + v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, filt, tmp0, tmp1; - mask = LOAD_SB(&mc_filt_mask_arr[16]); + mask = LD_SB(&mc_filt_mask_arr[16]); /* rearranging filter */ - filt = LOAD_UH(filter_horiz); - filt_horiz = (v16u8)__msa_splati_h((v8i16)filt, 0); - - filt = LOAD_UH(filter_vert); - filt_vert = (v16u8)__msa_splati_h((v8i16)filt, 0); - - LOAD_5VECS_SB(src, src_stride, src0, src1, src2, src3, src4); - - horiz_vec = (v16u8)__msa_vshf_b(mask, src1, src0); - horiz_out0 = __msa_dotp_u_h(horiz_vec, filt_horiz); - 
horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_out0, FILTER_BITS, 7); - - horiz_vec = (v16u8)__msa_vshf_b(mask, src3, src2); - horiz_out2 = __msa_dotp_u_h(horiz_vec, filt_horiz); - horiz_out2 = SRARI_SATURATE_UNSIGNED_H(horiz_out2, FILTER_BITS, 7); - - horiz_vec = (v16u8)__msa_vshf_b(mask, src4, src4); - horiz_out4 = __msa_dotp_u_h(horiz_vec, filt_horiz); - horiz_out4 = SRARI_SATURATE_UNSIGNED_H(horiz_out4, FILTER_BITS, 7); - - horiz_out1 = (v8u16)__msa_sldi_b((v16i8)horiz_out2, (v16i8)horiz_out0, 8); - horiz_out3 = (v8u16)__msa_pckod_d((v2i64)horiz_out4, (v2i64)horiz_out2); - - vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0); - vec1 = (v16u8)__msa_ilvev_b((v16i8)horiz_out3, (v16i8)horiz_out2); - - tmp0 = __msa_dotp_u_h(vec0, filt_vert); - tmp1 = __msa_dotp_u_h(vec1, filt_vert); - tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7); - tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7); - - res0 = (v16u8)__msa_pckev_b((v16i8)tmp0, (v16i8)tmp0); - res1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp1); - - out0 = __msa_copy_u_w((v4i32)res0, 0); - out1 = __msa_copy_u_w((v4i32)res0, 1); - out2 = __msa_copy_u_w((v4i32)res1, 0); - out3 = __msa_copy_u_w((v4i32)res1, 1); - - STORE_WORD(dst, out0); - dst += dst_stride; - STORE_WORD(dst, out1); - dst += dst_stride; - STORE_WORD(dst, out2); - dst += dst_stride; - STORE_WORD(dst, out3); + filt = LD_UH(filter_horiz); + filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0); + + filt = LD_UH(filter_vert); + filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); + hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8); + hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2); + + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + SAT_UH2_UH(tmp0, tmp1, 7); + PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1); + ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); } static void common_hv_2ht_2vt_4x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter_horiz, int8_t *filter_vert) { - uint32_t out0, out1, out2, out3; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask; - v16u8 filt_horiz, filt_vert, horiz_vec; - v16u8 vec0, vec1, vec2, vec3; - v8u16 horiz_out0, horiz_out1, horiz_out2, horiz_out3; - v8u16 vec4, vec5, vec6, vec7, filt; - v8u16 horiz_out4, horiz_out5, horiz_out6, horiz_out7, horiz_out8; v16i8 res0, res1, res2, res3; + v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3; + v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; + v8u16 hz_out7, hz_out8, vec4, vec5, vec6, vec7, filt; - mask = LOAD_SB(&mc_filt_mask_arr[16]); + mask = LD_SB(&mc_filt_mask_arr[16]); /* rearranging filter */ - filt = LOAD_UH(filter_horiz); - filt_horiz = (v16u8)__msa_splati_h((v8i16)filt, 0); + filt = LD_UH(filter_horiz); + filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0); - filt = LOAD_UH(filter_vert); - filt_vert = (v16u8)__msa_splati_h((v8i16)filt, 0); + filt = LD_UH(filter_vert); + filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0); - LOAD_8VECS_SB(src, src_stride, - src0, src1, src2, src3, src4, src5, src6, src7); + LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); src += (8 * 
src_stride); - src8 = LOAD_SB(src); - - horiz_vec = (v16u8)__msa_vshf_b(mask, src1, src0); - horiz_out0 = __msa_dotp_u_h(horiz_vec, filt_horiz); - horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_out0, FILTER_BITS, 7); - - horiz_vec = (v16u8)__msa_vshf_b(mask, src3, src2); - horiz_out2 = __msa_dotp_u_h(horiz_vec, filt_horiz); - horiz_out2 = SRARI_SATURATE_UNSIGNED_H(horiz_out2, FILTER_BITS, 7); - - horiz_vec = (v16u8)__msa_vshf_b(mask, src5, src4); - horiz_out4 = __msa_dotp_u_h(horiz_vec, filt_horiz); - horiz_out4 = SRARI_SATURATE_UNSIGNED_H(horiz_out4, FILTER_BITS, 7); - - horiz_vec = (v16u8)__msa_vshf_b(mask, src7, src6); - horiz_out6 = __msa_dotp_u_h(horiz_vec, filt_horiz); - horiz_out6 = SRARI_SATURATE_UNSIGNED_H(horiz_out6, FILTER_BITS, 7); - - horiz_vec = (v16u8)__msa_vshf_b(mask, src8, src8); - horiz_out8 = __msa_dotp_u_h(horiz_vec, filt_horiz); - horiz_out8 = SRARI_SATURATE_UNSIGNED_H(horiz_out8, FILTER_BITS, 7); - - horiz_out1 = (v8u16)__msa_sldi_b((v16i8)horiz_out2, (v16i8)horiz_out0, 8); - horiz_out3 = (v8u16)__msa_sldi_b((v16i8)horiz_out4, (v16i8)horiz_out2, 8); - horiz_out5 = (v8u16)__msa_sldi_b((v16i8)horiz_out6, (v16i8)horiz_out4, 8); - horiz_out7 = (v8u16)__msa_pckod_d((v2i64)horiz_out8, (v2i64)horiz_out6); - - vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0); - vec1 = (v16u8)__msa_ilvev_b((v16i8)horiz_out3, (v16i8)horiz_out2); - vec2 = (v16u8)__msa_ilvev_b((v16i8)horiz_out5, (v16i8)horiz_out4); - vec3 = (v16u8)__msa_ilvev_b((v16i8)horiz_out7, (v16i8)horiz_out6); - - vec4 = __msa_dotp_u_h(vec0, filt_vert); - vec5 = __msa_dotp_u_h(vec1, filt_vert); - vec6 = __msa_dotp_u_h(vec2, filt_vert); - vec7 = __msa_dotp_u_h(vec3, filt_vert); - - vec4 = SRARI_SATURATE_UNSIGNED_H(vec4, FILTER_BITS, 7); - vec5 = SRARI_SATURATE_UNSIGNED_H(vec5, FILTER_BITS, 7); - vec6 = SRARI_SATURATE_UNSIGNED_H(vec6, FILTER_BITS, 7); - vec7 = SRARI_SATURATE_UNSIGNED_H(vec7, FILTER_BITS, 7); - - res0 = __msa_pckev_b((v16i8)vec4, (v16i8)vec4); - res1 = __msa_pckev_b((v16i8)vec5, (v16i8)vec5); - res2 = __msa_pckev_b((v16i8)vec6, (v16i8)vec6); - res3 = __msa_pckev_b((v16i8)vec7, (v16i8)vec7); - - out0 = __msa_copy_u_w((v4i32)res0, 0); - out1 = __msa_copy_u_w((v4i32)res0, 1); - out2 = __msa_copy_u_w((v4i32)res1, 0); - out3 = __msa_copy_u_w((v4i32)res1, 1); - - STORE_WORD(dst, out0); - dst += dst_stride; - STORE_WORD(dst, out1); - dst += dst_stride; - STORE_WORD(dst, out2); - dst += dst_stride; - STORE_WORD(dst, out3); - dst += dst_stride; - - out0 = __msa_copy_u_w((v4i32)res2, 0); - out1 = __msa_copy_u_w((v4i32)res2, 1); - out2 = __msa_copy_u_w((v4i32)res3, 0); - out3 = __msa_copy_u_w((v4i32)res3, 1); - - STORE_WORD(dst, out0); - dst += dst_stride; - STORE_WORD(dst, out1); - dst += dst_stride; - STORE_WORD(dst, out2); - dst += dst_stride; - STORE_WORD(dst, out3); + src8 = LD_SB(src); + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); + hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, FILTER_BITS); + hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, FILTER_BITS); + hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS); + SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1, + hz_out3, hz_out5, 8); + hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6); + + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, 
filt_vt, + vec4, vec5, vec6, vec7); + SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS); + SAT_UH4_UH(vec4, vec5, vec6, vec7, 7); + PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, + res2, res3); + ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); + dst += (4 * dst_stride); + ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); } static void common_hv_2ht_2vt_4w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - int8_t *filter_horiz, - int8_t *filter_vert, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { if (4 == height) { - common_hv_2ht_2vt_4x4_msa(src, src_stride, dst, dst_stride, - filter_horiz, filter_vert); + common_hv_2ht_2vt_4x4_msa(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert); } else if (8 == height) { - common_hv_2ht_2vt_4x8_msa(src, src_stride, dst, dst_stride, - filter_horiz, filter_vert); + common_hv_2ht_2vt_4x8_msa(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert); } } @@ -455,63 +323,43 @@ static void common_hv_2ht_2vt_8x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter_horiz, int8_t *filter_vert) { - v16i8 src0, src1, src2, src3, src4, mask; - v16u8 filt_horiz, filt_vert, horiz_vec; - v16u8 vec0, vec1, vec2, vec3; - v8u16 horiz_out0, horiz_out1; - v8u16 tmp0, tmp1, tmp2, tmp3; + v16i8 src0, src1, src2, src3, src4, mask, out0, out1; + v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3; + v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; v8i16 filt; - mask = LOAD_SB(&mc_filt_mask_arr[0]); + mask = LD_SB(&mc_filt_mask_arr[0]); /* rearranging filter */ - filt = LOAD_SH(filter_horiz); - filt_horiz = (v16u8)__msa_splati_h(filt, 0); - - filt = LOAD_SH(filter_vert); - filt_vert = (v16u8)__msa_splati_h(filt, 0); - - LOAD_5VECS_SB(src, src_stride, src0, src1, src2, src3, src4); - src += (5 * src_stride); - - horiz_vec = (v16u8)__msa_vshf_b(mask, src0, src0); - horiz_out0 = __msa_dotp_u_h(horiz_vec, filt_horiz); - horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_out0, FILTER_BITS, 7); - - horiz_vec = (v16u8)__msa_vshf_b(mask, src1, src1); - horiz_out1 = __msa_dotp_u_h(horiz_vec, filt_horiz); - horiz_out1 = SRARI_SATURATE_UNSIGNED_H(horiz_out1, FILTER_BITS, 7); + filt = LD_SH(filter_horiz); + filt_hz = (v16u8)__msa_splati_h(filt, 0); - vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0); - tmp0 = __msa_dotp_u_h(vec0, filt_vert); + filt = LD_SH(filter_vert); + filt_vt = (v16u8)__msa_splati_h(filt, 0); - horiz_vec = (v16u8)__msa_vshf_b(mask, src2, src2); - horiz_out0 = __msa_dotp_u_h(horiz_vec, filt_horiz); - horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_out0, FILTER_BITS, 7); + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); - vec1 = (v16u8)__msa_ilvev_b((v16i8)horiz_out0, (v16i8)horiz_out1); - tmp1 = __msa_dotp_u_h(vec1, filt_vert); + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp0 = __msa_dotp_u_h(vec0, filt_vt); - horiz_vec = (v16u8)__msa_vshf_b(mask, src3, src3); - horiz_out1 = __msa_dotp_u_h(horiz_vec, filt_horiz); - horiz_out1 = SRARI_SATURATE_UNSIGNED_H(horiz_out1, FILTER_BITS, 7); + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + vec1 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp1 = __msa_dotp_u_h(vec1, filt_vt); - vec2 = (v16u8)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0); - tmp2 = __msa_dotp_u_h(vec2, filt_vert); + hz_out1 
= HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + vec2 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp2 = __msa_dotp_u_h(vec2, filt_vt); - horiz_vec = (v16u8)__msa_vshf_b(mask, src4, src4); - horiz_out0 = __msa_dotp_u_h(horiz_vec, filt_horiz); - horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_out0, FILTER_BITS, 7); + hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + vec3 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp3 = __msa_dotp_u_h(vec3, filt_vt); - vec3 = (v16u8)__msa_ilvev_b((v16i8)horiz_out0, (v16i8)horiz_out1); - tmp3 = __msa_dotp_u_h(vec3, filt_vert); - - tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7); - tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7); - tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7); - tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7); - - PCKEV_B_STORE_8_BYTES_4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7); + PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); } static void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src, @@ -522,106 +370,76 @@ static void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src, int8_t *filter_vert, int32_t height) { uint32_t loop_cnt; - v16i8 src0, src1, src2, src3, src4, mask; - v16u8 filt_horiz, filt_vert, vec0, horiz_vec; - v8u16 horiz_out0, horiz_out1; - v8u16 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; + v16i8 src0, src1, src2, src3, src4, mask, out0, out1; + v16u8 filt_hz, filt_vt, vec0; + v8u16 hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; v8i16 filt; - mask = LOAD_SB(&mc_filt_mask_arr[0]); + mask = LD_SB(&mc_filt_mask_arr[0]); /* rearranging filter */ - filt = LOAD_SH(filter_horiz); - filt_horiz = (v16u8)__msa_splati_h(filt, 0); + filt = LD_SH(filter_horiz); + filt_hz = (v16u8)__msa_splati_h(filt, 0); - filt = LOAD_SH(filter_vert); - filt_vert = (v16u8)__msa_splati_h(filt, 0); + filt = LD_SH(filter_vert); + filt_vt = (v16u8)__msa_splati_h(filt, 0); - src0 = LOAD_SB(src); + src0 = LD_SB(src); src += src_stride; - horiz_vec = (v16u8)__msa_vshf_b(mask, src0, src0); - horiz_out0 = __msa_dotp_u_h(horiz_vec, filt_horiz); - horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_out0, FILTER_BITS, 7); + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); for (loop_cnt = (height >> 3); loop_cnt--;) { - LOAD_4VECS_SB(src, src_stride, src1, src2, src3, src4); + LD_SB4(src, src_stride, src1, src2, src3, src4); src += (4 * src_stride); - horiz_vec = (v16u8)__msa_vshf_b(mask, src1, src1); - horiz_out1 = __msa_dotp_u_h(horiz_vec, filt_horiz); - horiz_out1 = SRARI_SATURATE_UNSIGNED_H(horiz_out1, FILTER_BITS, 7); - - vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0); - tmp1 = __msa_dotp_u_h(vec0, filt_vert); + hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp1 = __msa_dotp_u_h(vec0, filt_vt); - horiz_vec = (v16u8)__msa_vshf_b(mask, src2, src2); - horiz_out0 = __msa_dotp_u_h(horiz_vec, filt_horiz); - horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_out0, FILTER_BITS, 7); + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp2 = __msa_dotp_u_h(vec0, filt_vt); - vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out0, (v16i8)horiz_out1); - tmp2 = (v8u16)__msa_dotp_u_h(vec0, filt_vert); + SRARI_H2_UH(tmp1, tmp2, FILTER_BITS); + 
SAT_UH2_UH(tmp1, tmp2, 7); - tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7); - tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7); + hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp3 = __msa_dotp_u_h(vec0, filt_vt); - horiz_vec = (v16u8)__msa_vshf_b(mask, src3, src3); - horiz_out1 = __msa_dotp_u_h(horiz_vec, filt_horiz); - horiz_out1 = SRARI_SATURATE_UNSIGNED_H(horiz_out1, FILTER_BITS, 7); - - vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0); - tmp3 = __msa_dotp_u_h(vec0, filt_vert); - - horiz_vec = (v16u8)__msa_vshf_b(mask, src4, src4); - horiz_out0 = __msa_dotp_u_h(horiz_vec, filt_horiz); - horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_out0, FILTER_BITS, 7); - - LOAD_4VECS_SB(src, src_stride, src1, src2, src3, src4); + hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + LD_SB4(src, src_stride, src1, src2, src3, src4); src += (4 * src_stride); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp4 = __msa_dotp_u_h(vec0, filt_vt); - vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out0, (v16i8)horiz_out1); - tmp4 = __msa_dotp_u_h(vec0, filt_vert); - - tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7); - tmp4 = SRARI_SATURATE_UNSIGNED_H(tmp4, FILTER_BITS, 7); - - PCKEV_B_STORE_8_BYTES_4(tmp1, tmp2, tmp3, tmp4, dst, dst_stride); + SRARI_H2_UH(tmp3, tmp4, FILTER_BITS); + SAT_UH2_UH(tmp3, tmp4, 7); + PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); dst += (4 * dst_stride); - horiz_vec = (v16u8)__msa_vshf_b(mask, src1, src1); - horiz_out1 = __msa_dotp_u_h(horiz_vec, filt_horiz); - horiz_out1 = SRARI_SATURATE_UNSIGNED_H(horiz_out1, FILTER_BITS, 7); - - vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0); - tmp5 = __msa_dotp_u_h(vec0, filt_vert); - - horiz_vec = (v16u8)__msa_vshf_b(mask, src2, src2); - horiz_out0 = __msa_dotp_u_h(horiz_vec, filt_horiz); - horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_out0, FILTER_BITS, 7); - - vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out0, (v16i8)horiz_out1); - tmp6 = __msa_dotp_u_h(vec0, filt_vert); - - horiz_vec = (v16u8)__msa_vshf_b(mask, src3, src3); - horiz_out1 = __msa_dotp_u_h(horiz_vec, filt_horiz); - horiz_out1 = SRARI_SATURATE_UNSIGNED_H(horiz_out1, FILTER_BITS, 7); + hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp5 = __msa_dotp_u_h(vec0, filt_vt); - vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0); - tmp7 = __msa_dotp_u_h(vec0, filt_vert); + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp6 = __msa_dotp_u_h(vec0, filt_vt); - horiz_vec = (v16u8)__msa_vshf_b(mask, src4, src4); - horiz_out0 = __msa_dotp_u_h(horiz_vec, filt_horiz); - horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_out0, FILTER_BITS, 7); + hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp7 = __msa_dotp_u_h(vec0, filt_vt); - vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out0, (v16i8)horiz_out1); - tmp8 = __msa_dotp_u_h(vec0, filt_vert); + hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp8 = __msa_dotp_u_h(vec0, filt_vt); - tmp5 = SRARI_SATURATE_UNSIGNED_H(tmp5, FILTER_BITS, 7); - tmp6 = SRARI_SATURATE_UNSIGNED_H(tmp6, FILTER_BITS, 
7); - tmp7 = SRARI_SATURATE_UNSIGNED_H(tmp7, FILTER_BITS, 7); - tmp8 = SRARI_SATURATE_UNSIGNED_H(tmp8, FILTER_BITS, 7); - - PCKEV_B_STORE_8_BYTES_4(tmp5, tmp6, tmp7, tmp8, dst, dst_stride); + SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, FILTER_BITS); + SAT_UH4_UH(tmp5, tmp6, tmp7, tmp8, 7); + PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); dst += (4 * dst_stride); } } @@ -645,108 +463,64 @@ static void common_hv_2ht_2vt_16w_msa(const uint8_t *src, int32_t src_stride, int32_t height) { uint32_t loop_cnt; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; - v16u8 filt_horiz, filt_vert, vec0, horiz_vec; - v8u16 horiz_vec0, horiz_vec1, tmp1, tmp2; - v8u16 horiz_out0, horiz_out1, horiz_out2, horiz_out3; + v16u8 filt_hz, filt_vt, vec0, vec1; + v8u16 tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3; v8i16 filt; - mask = LOAD_SB(&mc_filt_mask_arr[0]); + mask = LD_SB(&mc_filt_mask_arr[0]); /* rearranging filter */ - filt = LOAD_SH(filter_horiz); - filt_horiz = (v16u8)__msa_splati_h(filt, 0); - - filt = LOAD_SH(filter_vert); - filt_vert = (v16u8)__msa_splati_h(filt, 0); - - src0 = LOAD_SB(src); - src1 = LOAD_SB(src + 8); + filt = LD_SH(filter_horiz); + filt_hz = (v16u8)__msa_splati_h(filt, 0); - horiz_vec = (v16u8)__msa_vshf_b(mask, src0, src0); - horiz_vec0 = __msa_dotp_u_h(horiz_vec, filt_horiz); - horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_vec0, FILTER_BITS, 7); - - horiz_vec = (v16u8)__msa_vshf_b(mask, src1, src1); - horiz_vec1 = __msa_dotp_u_h(horiz_vec, filt_horiz); - horiz_out2 = SRARI_SATURATE_UNSIGNED_H(horiz_vec1, FILTER_BITS, 7); + filt = LD_SH(filter_vert); + filt_vt = (v16u8)__msa_splati_h(filt, 0); + LD_SB2(src, 8, src0, src1); src += src_stride; + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + for (loop_cnt = (height >> 2); loop_cnt--;) { - LOAD_4VECS_SB(src, src_stride, src0, src2, src4, src6); - LOAD_4VECS_SB(src + 8, src_stride, src1, src3, src5, src7); + LD_SB4(src, src_stride, src0, src2, src4, src6); + LD_SB4(src + 8, src_stride, src1, src3, src5, src7); src += (4 * src_stride); - horiz_vec = (v16u8)__msa_vshf_b(mask, src0, src0); - horiz_vec0 = __msa_dotp_u_h(horiz_vec, filt_horiz); - horiz_out1 = SRARI_SATURATE_UNSIGNED_H(horiz_vec0, FILTER_BITS, 7); - - horiz_vec = (v16u8)__msa_vshf_b(mask, src1, src1); - horiz_vec1 = __msa_dotp_u_h(horiz_vec, filt_horiz); - horiz_out3 = SRARI_SATURATE_UNSIGNED_H(horiz_vec1, FILTER_BITS, 7); - - vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0); - tmp1 = __msa_dotp_u_h(vec0, filt_vert); - vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out3, (v16i8)horiz_out2); - tmp2 = __msa_dotp_u_h(vec0, filt_vert); - tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7); - tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7); - - PCKEV_B_STORE_VEC(tmp2, tmp1, dst); + hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); + SRARI_H2_UH(tmp1, tmp2, FILTER_BITS); + SAT_UH2_UH(tmp1, tmp2, 7); + PCKEV_ST_SB(tmp1, tmp2, dst); dst += dst_stride; - horiz_vec = (v16u8)__msa_vshf_b(mask, src2, src2); - horiz_vec0 = __msa_dotp_u_h(horiz_vec, filt_horiz); - horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_vec0, FILTER_BITS, 7); - - horiz_vec = (v16u8)__msa_vshf_b(mask, src3, src3); - horiz_vec1 = 
__msa_dotp_u_h(horiz_vec, filt_horiz); - horiz_out2 = SRARI_SATURATE_UNSIGNED_H(horiz_vec1, FILTER_BITS, 7); - - vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out0, (v16i8)horiz_out1); - tmp1 = __msa_dotp_u_h(vec0, filt_vert); - vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out2, (v16i8)horiz_out3); - tmp2 = __msa_dotp_u_h(vec0, filt_vert); - tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7); - tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7); - - PCKEV_B_STORE_VEC(tmp2, tmp1, dst); + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); + SRARI_H2_UH(tmp1, tmp2, FILTER_BITS); + SAT_UH2_UH(tmp1, tmp2, 7); + PCKEV_ST_SB(tmp1, tmp2, dst); dst += dst_stride; - horiz_vec = (v16u8)__msa_vshf_b(mask, src4, src4); - horiz_vec0 = __msa_dotp_u_h(horiz_vec, filt_horiz); - horiz_out1 = SRARI_SATURATE_UNSIGNED_H(horiz_vec0, FILTER_BITS, 7); - - horiz_vec = (v16u8)__msa_vshf_b(mask, src5, src5); - horiz_vec1 = __msa_dotp_u_h(horiz_vec, filt_horiz); - horiz_out3 = SRARI_SATURATE_UNSIGNED_H(horiz_vec1, FILTER_BITS, 7); - - vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0); - tmp1 = __msa_dotp_u_h(vec0, filt_vert); - vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out3, (v16i8)horiz_out2); - tmp2 = __msa_dotp_u_h(vec0, filt_vert); - tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7); - tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7); - - PCKEV_B_STORE_VEC(tmp2, tmp1, dst); + hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); + SRARI_H2_UH(tmp1, tmp2, FILTER_BITS); + SAT_UH2_UH(tmp1, tmp2, 7); + PCKEV_ST_SB(tmp1, tmp2, dst); dst += dst_stride; - horiz_vec = (v16u8)__msa_vshf_b(mask, src6, src6); - horiz_vec0 = __msa_dotp_u_h(horiz_vec, filt_horiz); - horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_vec0, FILTER_BITS, 7); - - horiz_vec = (v16u8)__msa_vshf_b(mask, src7, src7); - horiz_vec1 = __msa_dotp_u_h(horiz_vec, filt_horiz); - horiz_out2 = SRARI_SATURATE_UNSIGNED_H(horiz_vec1, FILTER_BITS, 7); - - vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out0, (v16i8)horiz_out1); - tmp1 = __msa_dotp_u_h(vec0, filt_vert); - vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out2, (v16i8)horiz_out3); - tmp2 = __msa_dotp_u_h(vec0, filt_vert); - tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7); - tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7); - - PCKEV_B_STORE_VEC(tmp2, tmp1, dst); + hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); + SRARI_H2_UH(tmp1, tmp2, FILTER_BITS); + SAT_UH2_UH(tmp1, tmp2, 7); + PCKEV_ST_SB(tmp1, tmp2, dst); dst += dst_stride; } } diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve8_vert_msa.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve8_vert_msa.c index 6b71ec1c0e4..e9ec2507a0a 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve8_vert_msa.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve8_vert_msa.c @@ -16,58 +16,48 @@ 
static void common_vt_8t_4w_msa(const uint8_t *src, int32_t src_stride, int8_t *filter, int32_t height) { uint32_t loop_cnt; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; - v16i8 src10_r, src32_r, src54_r, src76_r, src98_r; - v16i8 src21_r, src43_r, src65_r, src87_r, src109_r; - v16i8 src2110, src4332, src6554, src8776, src10998; + v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; + v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776; + v16i8 src10998, filt0, filt1, filt2, filt3; + v16u8 out; v8i16 filt, out10, out32; - v16i8 filt0, filt1, filt2, filt3; src -= (3 * src_stride); - filt = LOAD_SH(filter); - filt0 = (v16i8)__msa_splati_h(filt, 0); - filt1 = (v16i8)__msa_splati_h(filt, 1); - filt2 = (v16i8)__msa_splati_h(filt, 2); - filt3 = (v16i8)__msa_splati_h(filt, 3); + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); - LOAD_7VECS_SB(src, src_stride, src0, src1, src2, src3, src4, src5, src6); + LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); src += (7 * src_stride); - ILVR_B_6VECS_SB(src0, src2, src4, src1, src3, src5, - src1, src3, src5, src2, src4, src6, - src10_r, src32_r, src54_r, src21_r, src43_r, src65_r); - - ILVR_D_3VECS_SB(src2110, src21_r, src10_r, src4332, src43_r, src32_r, - src6554, src65_r, src54_r); - - XORI_B_3VECS_SB(src2110, src4332, src6554, src2110, src4332, src6554, 128); + ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r, + src54_r, src21_r); + ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); + ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110, + src4332, src6554); + XORI_B3_128_SB(src2110, src4332, src6554); for (loop_cnt = (height >> 2); loop_cnt--;) { - LOAD_4VECS_SB(src, src_stride, src7, src8, src9, src10); + LD_SB4(src, src_stride, src7, src8, src9, src10); src += (4 * src_stride); - ILVR_B_4VECS_SB(src6, src7, src8, src9, src7, src8, src9, src10, - src76_r, src87_r, src98_r, src109_r); - - ILVR_D_2VECS_SB(src8776, src87_r, src76_r, src10998, src109_r, src98_r); - - XORI_B_2VECS_SB(src8776, src10998, src8776, src10998, 128); - - out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, - filt0, filt1, filt2, filt3); - out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, - filt0, filt1, filt2, filt3); - - out10 = SRARI_SATURATE_SIGNED_H(out10, FILTER_BITS, 7); - out32 = SRARI_SATURATE_SIGNED_H(out32, FILTER_BITS, 7); - - PCKEV_2B_XORI128_STORE_4_BYTES_4(out10, out32, dst, dst_stride); + ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, + src87_r, src98_r, src109_r); + ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998); + XORI_B2_128_SB(src8776, src10998); + out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0, + filt1, filt2, filt3); + out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0, + filt1, filt2, filt3); + SRARI_H2_SH(out10, out32, FILTER_BITS); + SAT_SH2_SH(out10, out32, 7); + out = PCKEV_XORI128_UB(out10, out32); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); dst += (4 * dst_stride); src2110 = src6554; src4332 = src8776; src6554 = src10998; - src6 = src10; } } @@ -77,54 +67,115 @@ static void common_vt_8t_8w_msa(const uint8_t *src, int32_t src_stride, int8_t *filter, int32_t height) { uint32_t loop_cnt; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; - v16i8 src10_r, src32_r, src54_r, src76_r, src98_r; - v16i8 src21_r, src43_r, src65_r, src87_r, src109_r; - v16i8 filt0, 
filt1, filt2, filt3; + v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; + v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3; + v16u8 tmp0, tmp1; v8i16 filt, out0_r, out1_r, out2_r, out3_r; src -= (3 * src_stride); - filt = LOAD_SH(filter); - filt0 = (v16i8)__msa_splati_h(filt, 0); - filt1 = (v16i8)__msa_splati_h(filt, 1); - filt2 = (v16i8)__msa_splati_h(filt, 2); - filt3 = (v16i8)__msa_splati_h(filt, 3); + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); - LOAD_7VECS_SB(src, src_stride, src0, src1, src2, src3, src4, src5, src6); + LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); src += (7 * src_stride); - - XORI_B_7VECS_SB(src0, src1, src2, src3, src4, src5, src6, - src0, src1, src2, src3, src4, src5, src6, 128); - - ILVR_B_6VECS_SB(src0, src2, src4, src1, src3, src5, - src1, src3, src5, src2, src4, src6, - src10_r, src32_r, src54_r, src21_r, src43_r, src65_r); + ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r, + src54_r, src21_r); + ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); for (loop_cnt = (height >> 2); loop_cnt--;) { - LOAD_4VECS_SB(src, src_stride, src7, src8, src9, src10); + LD_SB4(src, src_stride, src7, src8, src9, src10); + XORI_B4_128_SB(src7, src8, src9, src10); src += (4 * src_stride); - XORI_B_4VECS_SB(src7, src8, src9, src10, src7, src8, src9, src10, 128); + ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, + src87_r, src98_r, src109_r); + out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0, + filt1, filt2, filt3); + out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0, + filt1, filt2, filt3); + out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0, + filt1, filt2, filt3); + out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0, + filt1, filt2, filt3); + SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS); + SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); + tmp0 = PCKEV_XORI128_UB(out0_r, out1_r); + tmp1 = PCKEV_XORI128_UB(out2_r, out3_r); + ST8x4_UB(tmp0, tmp1, dst, dst_stride); + dst += (4 * dst_stride); - ILVR_B_4VECS_SB(src6, src7, src8, src9, src7, src8, src9, src10, - src76_r, src87_r, src98_r, src109_r); + src10_r = src54_r; + src32_r = src76_r; + src54_r = src98_r; + src21_r = src65_r; + src43_r = src87_r; + src65_r = src109_r; + src6 = src10; + } +} - out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, - filt0, filt1, filt2, filt3); - out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, - filt0, filt1, filt2, filt3); - out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, - filt0, filt1, filt2, filt3); - out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, - filt0, filt1, filt2, filt3); +static void common_vt_8t_16w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16i8 filt0, filt1, filt2, filt3; + v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; + v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l; + v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l; + v16u8 tmp0, tmp1, tmp2, tmp3; + v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l; + + src -= (3 * src_stride); - out0_r = SRARI_SATURATE_SIGNED_H(out0_r, 
FILTER_BITS, 7); - out1_r = SRARI_SATURATE_SIGNED_H(out1_r, FILTER_BITS, 7); - out2_r = SRARI_SATURATE_SIGNED_H(out2_r, FILTER_BITS, 7); - out3_r = SRARI_SATURATE_SIGNED_H(out3_r, FILTER_BITS, 7); + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + src += (7 * src_stride); + ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r, + src54_r, src21_r); + ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); + ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l, + src54_l, src21_l); + ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src7, src8, src9, src10); + XORI_B4_128_SB(src7, src8, src9, src10); + src += (4 * src_stride); - PCKEV_B_4_XORI128_STORE_8_BYTES_4(out0_r, out1_r, out2_r, out3_r, - dst, dst_stride); + ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, + src87_r, src98_r, src109_r); + ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l, + src87_l, src98_l, src109_l); + out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0, + filt1, filt2, filt3); + out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0, + filt1, filt2, filt3); + out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0, + filt1, filt2, filt3); + out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0, + filt1, filt2, filt3); + out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0, + filt1, filt2, filt3); + out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0, + filt1, filt2, filt3); + out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0, + filt1, filt2, filt3); + out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0, + filt1, filt2, filt3); + SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS); + SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, FILTER_BITS); + SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); + SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7); + PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, out3_r, + tmp0, tmp1, tmp2, tmp3); + XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3); + ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride); dst += (4 * dst_stride); src10_r = src54_r; @@ -133,7 +184,12 @@ static void common_vt_8t_8w_msa(const uint8_t *src, int32_t src_stride, src21_r = src65_r; src43_r = src87_r; src65_r = src109_r; - + src10_l = src54_l; + src32_l = src76_l; + src54_l = src98_l; + src21_l = src65_l; + src43_l = src87_l; + src65_l = src109_l; src6 = src10; } } @@ -147,89 +203,63 @@ static void common_vt_8t_16w_mult_msa(const uint8_t *src, int32_t src_stride, uint32_t loop_cnt, cnt; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; v16i8 filt0, filt1, filt2, filt3; - v16i8 src10_r, src32_r, src54_r, src76_r, src98_r; - v16i8 src21_r, src43_r, src65_r, src87_r, src109_r; - v16i8 src10_l, src32_l, src54_l, src76_l, src98_l; - v16i8 src21_l, src43_l, src65_l, src87_l, src109_l; - v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l; - v8i16 filt; + v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; + v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l; + v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l; v16u8 tmp0, tmp1, tmp2, tmp3; + v8i16 filt, out0_r, 
out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l; src -= (3 * src_stride); - filt = LOAD_SH(filter); - filt0 = (v16i8)__msa_splati_h(filt, 0); - filt1 = (v16i8)__msa_splati_h(filt, 1); - filt2 = (v16i8)__msa_splati_h(filt, 2); - filt3 = (v16i8)__msa_splati_h(filt, 3); + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); for (cnt = (width >> 4); cnt--;) { src_tmp = src; dst_tmp = dst; - LOAD_7VECS_SB(src_tmp, src_stride, - src0, src1, src2, src3, src4, src5, src6); + LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6); + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); src_tmp += (7 * src_stride); - - XORI_B_7VECS_SB(src0, src1, src2, src3, src4, src5, src6, - src0, src1, src2, src3, src4, src5, src6, 128); - - ILVR_B_6VECS_SB(src0, src2, src4, src1, src3, src5, - src1, src3, src5, src2, src4, src6, - src10_r, src32_r, src54_r, src21_r, src43_r, src65_r); - - ILVL_B_6VECS_SB(src0, src2, src4, src1, src3, src5, - src1, src3, src5, src2, src4, src6, - src10_l, src32_l, src54_l, src21_l, src43_l, src65_l); + ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, + src32_r, src54_r, src21_r); + ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); + ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, + src32_l, src54_l, src21_l); + ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l); for (loop_cnt = (height >> 2); loop_cnt--;) { - LOAD_4VECS_SB(src_tmp, src_stride, src7, src8, src9, src10); + LD_SB4(src_tmp, src_stride, src7, src8, src9, src10); + XORI_B4_128_SB(src7, src8, src9, src10); src_tmp += (4 * src_stride); - - XORI_B_4VECS_SB(src7, src8, src9, src10, src7, src8, src9, src10, 128); - - ILVR_B_4VECS_SB(src6, src7, src8, src9, src7, src8, src9, src10, - src76_r, src87_r, src98_r, src109_r); - - ILVL_B_4VECS_SB(src6, src7, src8, src9, src7, src8, src9, src10, - src76_l, src87_l, src98_l, src109_l); - - out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, - filt0, filt1, filt2, filt3); - out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, - filt0, filt1, filt2, filt3); - out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, - filt0, filt1, filt2, filt3); - out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, - filt0, filt1, filt2, filt3); - - out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, - filt0, filt1, filt2, filt3); - out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, - filt0, filt1, filt2, filt3); - out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, - filt0, filt1, filt2, filt3); - out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, - filt0, filt1, filt2, filt3); - - out0_r = SRARI_SATURATE_SIGNED_H(out0_r, FILTER_BITS, 7); - out1_r = SRARI_SATURATE_SIGNED_H(out1_r, FILTER_BITS, 7); - out2_r = SRARI_SATURATE_SIGNED_H(out2_r, FILTER_BITS, 7); - out3_r = SRARI_SATURATE_SIGNED_H(out3_r, FILTER_BITS, 7); - out0_l = SRARI_SATURATE_SIGNED_H(out0_l, FILTER_BITS, 7); - out1_l = SRARI_SATURATE_SIGNED_H(out1_l, FILTER_BITS, 7); - out2_l = SRARI_SATURATE_SIGNED_H(out2_l, FILTER_BITS, 7); - out3_l = SRARI_SATURATE_SIGNED_H(out3_l, FILTER_BITS, 7); - - out0_r = (v8i16)__msa_pckev_b((v16i8)out0_l, (v16i8)out0_r); - out1_r = (v8i16)__msa_pckev_b((v16i8)out1_l, (v16i8)out1_r); - out2_r = (v8i16)__msa_pckev_b((v16i8)out2_l, (v16i8)out2_r); - out3_r = (v8i16)__msa_pckev_b((v16i8)out3_l, (v16i8)out3_r); - - XORI_B_4VECS_UB(out0_r, out1_r, out2_r, out3_r, - tmp0, tmp1, tmp2, tmp3, 128); - - 
STORE_4VECS_UB(dst_tmp, dst_stride, tmp0, tmp1, tmp2, tmp3); + ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, + src87_r, src98_r, src109_r); + ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l, + src87_l, src98_l, src109_l); + out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0, + filt1, filt2, filt3); + out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0, + filt1, filt2, filt3); + out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0, + filt1, filt2, filt3); + out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0, + filt1, filt2, filt3); + out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0, + filt1, filt2, filt3); + out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0, + filt1, filt2, filt3); + out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0, + filt1, filt2, filt3); + out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0, + filt1, filt2, filt3); + SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS); + SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, FILTER_BITS); + SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); + SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7); + PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, + out3_r, tmp0, tmp1, tmp2, tmp3); + XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3); + ST_UB4(tmp0, tmp1, tmp2, tmp3, dst_tmp, dst_stride); dst_tmp += (4 * dst_stride); src10_r = src54_r; @@ -238,14 +268,12 @@ static void common_vt_8t_16w_mult_msa(const uint8_t *src, int32_t src_stride, src21_r = src65_r; src43_r = src87_r; src65_r = src109_r; - src10_l = src54_l; src32_l = src76_l; src54_l = src98_l; src21_l = src65_l; src43_l = src87_l; src65_l = src109_l; - src6 = src10; } @@ -254,134 +282,77 @@ static void common_vt_8t_16w_mult_msa(const uint8_t *src, int32_t src_stride, } } -static void common_vt_8t_16w_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter, int32_t height) { - common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, - filter, height, 16); -} - static void common_vt_8t_32w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter, int32_t height) { - common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, - filter, height, 32); + common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height, + 32); } static void common_vt_8t_64w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter, int32_t height) { - common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, - filter, height, 64); + common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height, + 64); } static void common_vt_2t_4x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { - uint32_t out0, out1, out2, out3; v16i8 src0, src1, src2, src3, src4; v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332; - v16i8 filt0; - v8u16 filt; + v16u8 filt0; + v8i16 filt; + v8u16 tmp0, tmp1; - filt = LOAD_UH(filter); - filt0 = (v16i8)__msa_splati_h((v8i16)filt, 0); + filt = LD_SH(filter); + filt0 = (v16u8)__msa_splati_h(filt, 0); - LOAD_5VECS_SB(src, src_stride, src0, src1, src2, src3, src4); + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); src += (5 * src_stride); - ILVR_B_4VECS_SB(src0, src1, src2, src3, src1, src2, src3, src4, - src10_r, src21_r, src32_r, src43_r); - - ILVR_D_2VECS_SB(src2110, src21_r, src10_r, 
src4332, src43_r, src32_r); - - src2110 = (v16i8)__msa_dotp_u_h((v16u8)src2110, (v16u8)filt0); - src4332 = (v16i8)__msa_dotp_u_h((v16u8)src4332, (v16u8)filt0); - - src2110 = (v16i8)SRARI_SATURATE_UNSIGNED_H(src2110, FILTER_BITS, 7); - src4332 = (v16i8)SRARI_SATURATE_UNSIGNED_H(src4332, FILTER_BITS, 7); - - src2110 = (v16i8)__msa_pckev_b((v16i8)src4332, (v16i8)src2110); - - out0 = __msa_copy_u_w((v4i32)src2110, 0); - out1 = __msa_copy_u_w((v4i32)src2110, 1); - out2 = __msa_copy_u_w((v4i32)src2110, 2); - out3 = __msa_copy_u_w((v4i32)src2110, 3); - - STORE_WORD(dst, out0); - dst += dst_stride; - STORE_WORD(dst, out1); - dst += dst_stride; - STORE_WORD(dst, out2); - dst += dst_stride; - STORE_WORD(dst, out3); + ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, + src32_r, src43_r); + ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); + DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + SAT_UH2_UH(tmp0, tmp1, 7); + src2110 = __msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride); } static void common_vt_2t_4x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { - uint32_t out0, out1, out2, out3, out4, out5, out6, out7; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r; v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776; - v16i8 filt0; - v8u16 filt; + v8u16 tmp0, tmp1, tmp2, tmp3; + v16u8 filt0; + v8i16 filt; - filt = LOAD_UH(filter); - filt0 = (v16i8)__msa_splati_h((v8i16)filt, 0); + filt = LD_SH(filter); + filt0 = (v16u8)__msa_splati_h(filt, 0); - LOAD_8VECS_SB(src, src_stride, - src0, src1, src2, src3, src4, src5, src6, src7); + LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); src += (8 * src_stride); - src8 = LOAD_SB(src); + src8 = LD_SB(src); src += src_stride; - ILVR_B_8VECS_SB(src0, src1, src2, src3, src4, src5, src6, src7, - src1, src2, src3, src4, src5, src6, src7, src8, - src10_r, src21_r, src32_r, src43_r, - src54_r, src65_r, src76_r, src87_r); - - ILVR_D_4VECS_SB(src2110, src21_r, src10_r, src4332, src43_r, src32_r, - src6554, src65_r, src54_r, src8776, src87_r, src76_r); - - src2110 = (v16i8)__msa_dotp_u_h((v16u8)src2110, (v16u8)filt0); - src4332 = (v16i8)__msa_dotp_u_h((v16u8)src4332, (v16u8)filt0); - src6554 = (v16i8)__msa_dotp_u_h((v16u8)src6554, (v16u8)filt0); - src8776 = (v16i8)__msa_dotp_u_h((v16u8)src8776, (v16u8)filt0); - - src2110 = (v16i8)SRARI_SATURATE_UNSIGNED_H(src2110, FILTER_BITS, 7); - src4332 = (v16i8)SRARI_SATURATE_UNSIGNED_H(src4332, FILTER_BITS, 7); - src6554 = (v16i8)SRARI_SATURATE_UNSIGNED_H(src6554, FILTER_BITS, 7); - src8776 = (v16i8)SRARI_SATURATE_UNSIGNED_H(src8776, FILTER_BITS, 7); - - src2110 = (v16i8)__msa_pckev_b((v16i8)src4332, (v16i8)src2110); - src4332 = (v16i8)__msa_pckev_b((v16i8)src8776, (v16i8)src6554); - - out0 = __msa_copy_u_w((v4i32)src2110, 0); - out1 = __msa_copy_u_w((v4i32)src2110, 1); - out2 = __msa_copy_u_w((v4i32)src2110, 2); - out3 = __msa_copy_u_w((v4i32)src2110, 3); - out4 = __msa_copy_u_w((v4i32)src4332, 0); - out5 = __msa_copy_u_w((v4i32)src4332, 1); - out6 = __msa_copy_u_w((v4i32)src4332, 2); - out7 = __msa_copy_u_w((v4i32)src4332, 3); - - STORE_WORD(dst, out0); - dst += dst_stride; - STORE_WORD(dst, out1); - dst += dst_stride; - STORE_WORD(dst, out2); - dst += dst_stride; - STORE_WORD(dst, out3); - dst += dst_stride; - STORE_WORD(dst, out4); - dst += 
dst_stride; - STORE_WORD(dst, out5); - dst += dst_stride; - STORE_WORD(dst, out6); - dst += dst_stride; - STORE_WORD(dst, out7); + ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, + src32_r, src43_r); + ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r, + src76_r, src87_r); + ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, + src87_r, src76_r, src2110, src4332, src6554, src8776); + DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0, + tmp0, tmp1, tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7); + PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332); + ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride); + ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride); } static void common_vt_2t_4w_msa(const uint8_t *src, int32_t src_stride, @@ -397,32 +368,24 @@ static void common_vt_2t_4w_msa(const uint8_t *src, int32_t src_stride, static void common_vt_2t_8x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { - v16u8 src0, src1, src2, src3, src4; - v16u8 vec0, vec1, vec2, vec3, filt0; + v16u8 src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0; + v16i8 out0, out1; v8u16 tmp0, tmp1, tmp2, tmp3; - v8u16 filt; + v8i16 filt; /* rearranging filter_y */ - filt = LOAD_UH(filter); - filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); - - LOAD_5VECS_UB(src, src_stride, src0, src1, src2, src3, src4); - - ILVR_B_2VECS_UB(src0, src1, src1, src2, vec0, vec1); - ILVR_B_2VECS_UB(src2, src3, src3, src4, vec2, vec3); - - /* filter calc */ - tmp0 = __msa_dotp_u_h(vec0, filt0); - tmp1 = __msa_dotp_u_h(vec1, filt0); - tmp2 = __msa_dotp_u_h(vec2, filt0); - tmp3 = __msa_dotp_u_h(vec3, filt0); - - tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7); - tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7); - tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7); - tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7); - - PCKEV_B_STORE_8_BYTES_4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride); + filt = LD_SH(filter); + filt0 = (v16u8)__msa_splati_h(filt, 0); + + LD_UB5(src, src_stride, src0, src1, src2, src3, src4); + ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1); + ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, + tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7); + PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); } static void common_vt_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride, @@ -431,51 +394,39 @@ static void common_vt_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride, uint32_t loop_cnt; v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8; v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; + v16i8 out0, out1; v8u16 tmp0, tmp1, tmp2, tmp3; - v8u16 filt; + v8i16 filt; /* rearranging filter_y */ - filt = LOAD_UH(filter); - filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + filt = LD_SH(filter); + filt0 = (v16u8)__msa_splati_h(filt, 0); - src0 = LOAD_UB(src); + src0 = LD_UB(src); src += src_stride; for (loop_cnt = (height >> 3); loop_cnt--;) { - LOAD_8VECS_UB(src, src_stride, - src1, src2, src3, src4, src5, src6, src7, src8); + LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8); src += (8 * src_stride); - ILVR_B_4VECS_UB(src0, src1, src2, src3, src1, src2, src3, src4, - vec0, 
vec1, vec2, vec3); - - ILVR_B_4VECS_UB(src4, src5, src6, src7, src5, src6, src7, src8, - vec4, vec5, vec6, vec7); - - tmp0 = __msa_dotp_u_h(vec0, filt0); - tmp1 = __msa_dotp_u_h(vec1, filt0); - tmp2 = __msa_dotp_u_h(vec2, filt0); - tmp3 = __msa_dotp_u_h(vec3, filt0); - - tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7); - tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7); - tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7); - tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7); - - PCKEV_B_STORE_8_BYTES_4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride); + ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, + vec2, vec3); + ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5, + vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, + tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7); + PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); dst += (4 * dst_stride); - tmp0 = __msa_dotp_u_h(vec4, filt0); - tmp1 = __msa_dotp_u_h(vec5, filt0); - tmp2 = __msa_dotp_u_h(vec6, filt0); - tmp3 = __msa_dotp_u_h(vec7, filt0); - - tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7); - tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7); - tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7); - tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7); - - PCKEV_B_STORE_8_BYTES_4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1, + tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7); + PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); dst += (4 * dst_stride); src0 = src8; @@ -499,57 +450,45 @@ static void common_vt_2t_16w_msa(const uint8_t *src, int32_t src_stride, v16u8 src0, src1, src2, src3, src4; v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; v8u16 tmp0, tmp1, tmp2, tmp3; - v8u16 filt; + v8i16 filt; /* rearranging filter_y */ - filt = LOAD_UH(filter); - filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + filt = LD_SH(filter); + filt0 = (v16u8)__msa_splati_h(filt, 0); - src0 = LOAD_UB(src); + src0 = LD_UB(src); src += src_stride; for (loop_cnt = (height >> 2); loop_cnt--;) { - LOAD_4VECS_UB(src, src_stride, src1, src2, src3, src4); + LD_UB4(src, src_stride, src1, src2, src3, src4); src += (4 * src_stride); - ILV_B_LRLR_UB(src0, src1, src1, src2, vec1, vec0, vec3, vec2); - - tmp0 = __msa_dotp_u_h(vec0, filt0); - tmp1 = __msa_dotp_u_h(vec1, filt0); - - tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7); - tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7); - - PCKEV_B_STORE_VEC(tmp1, tmp0, dst); + ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); + ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + SAT_UH2_UH(tmp0, tmp1, 7); + PCKEV_ST_SB(tmp0, tmp1, dst); dst += dst_stride; - ILV_B_LRLR_UB(src2, src3, src3, src4, vec5, vec4, vec7, vec6); - - tmp2 = __msa_dotp_u_h(vec2, filt0); - tmp3 = __msa_dotp_u_h(vec3, filt0); - - tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7); - tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7); - - PCKEV_B_STORE_VEC(tmp3, tmp2, dst); + ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6); + ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7); + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + 
SAT_UH2_UH(tmp2, tmp3, 7); + PCKEV_ST_SB(tmp2, tmp3, dst); dst += dst_stride; - tmp0 = __msa_dotp_u_h(vec4, filt0); - tmp1 = __msa_dotp_u_h(vec5, filt0); - - tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7); - tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7); - - PCKEV_B_STORE_VEC(tmp1, tmp0, dst); + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + SAT_UH2_UH(tmp0, tmp1, 7); + PCKEV_ST_SB(tmp0, tmp1, dst); dst += dst_stride; - tmp2 = __msa_dotp_u_h(vec6, filt0); - tmp3 = __msa_dotp_u_h(vec7, filt0); - - tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7); - tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7); - - PCKEV_B_STORE_VEC(tmp3, tmp2, dst); + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + SAT_UH2_UH(tmp2, tmp3, 7); + PCKEV_ST_SB(tmp2, tmp3, dst); dst += dst_stride; src0 = src4; @@ -563,93 +502,68 @@ static void common_vt_2t_32w_msa(const uint8_t *src, int32_t src_stride, v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9; v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; v8u16 tmp0, tmp1, tmp2, tmp3; - v8u16 filt; + v8i16 filt; /* rearranging filter_y */ - filt = LOAD_UH(filter); - filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + filt = LD_SH(filter); + filt0 = (v16u8)__msa_splati_h(filt, 0); - src0 = LOAD_UB(src); - src5 = LOAD_UB(src + 16); + src0 = LD_UB(src); + src5 = LD_UB(src + 16); src += src_stride; for (loop_cnt = (height >> 2); loop_cnt--;) { - LOAD_4VECS_UB(src, src_stride, src1, src2, src3, src4); + LD_UB4(src, src_stride, src1, src2, src3, src4); + ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); + ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); - ILV_B_LRLR_UB(src0, src1, src1, src2, vec1, vec0, vec3, vec2); - - LOAD_4VECS_UB(src + 16, src_stride, src6, src7, src8, src9); + LD_UB4(src + 16, src_stride, src6, src7, src8, src9); src += (4 * src_stride); - tmp0 = __msa_dotp_u_h(vec0, filt0); - tmp1 = __msa_dotp_u_h(vec1, filt0); - - tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7); - tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7); - - PCKEV_B_STORE_VEC(tmp1, tmp0, dst); - - tmp2 = __msa_dotp_u_h(vec2, filt0); - tmp3 = __msa_dotp_u_h(vec3, filt0); - - tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7); - tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7); - - PCKEV_B_STORE_VEC(tmp3, tmp2, dst + dst_stride); - - ILV_B_LRLR_UB(src2, src3, src3, src4, vec5, vec4, vec7, vec6); - - tmp0 = __msa_dotp_u_h(vec4, filt0); - tmp1 = __msa_dotp_u_h(vec5, filt0); - - tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7); - tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7); - - PCKEV_B_STORE_VEC(tmp1, tmp0, dst + 2 * dst_stride); - - tmp2 = __msa_dotp_u_h(vec6, filt0); - tmp3 = __msa_dotp_u_h(vec7, filt0); - - tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7); - tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7); - - PCKEV_B_STORE_VEC(tmp3, tmp2, dst + 3 * dst_stride); - - ILV_B_LRLR_UB(src5, src6, src6, src7, vec1, vec0, vec3, vec2); - - tmp0 = __msa_dotp_u_h(vec0, filt0); - tmp1 = __msa_dotp_u_h(vec1, filt0); - - tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7); - tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7); - - PCKEV_B_STORE_VEC(tmp1, tmp0, dst + 16); - - tmp2 = __msa_dotp_u_h(vec2, filt0); - tmp3 = __msa_dotp_u_h(vec3, filt0); - - tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7); - tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7); - - PCKEV_B_STORE_VEC(tmp3, tmp2, dst + 16 + dst_stride); 
- - ILV_B_LRLR_UB(src7, src8, src8, src9, vec5, vec4, vec7, vec6); - - tmp0 = __msa_dotp_u_h(vec4, filt0); - tmp1 = __msa_dotp_u_h(vec5, filt0); - - tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7); - tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7); - - PCKEV_B_STORE_VEC(tmp1, tmp0, dst + 16 + 2 * dst_stride); - - tmp2 = __msa_dotp_u_h(vec6, filt0); - tmp3 = __msa_dotp_u_h(vec7, filt0); - - tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7); - tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7); - - PCKEV_B_STORE_VEC(tmp3, tmp2, dst + 16 + 3 * dst_stride); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + SAT_UH2_UH(tmp0, tmp1, 7); + PCKEV_ST_SB(tmp0, tmp1, dst); + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + SAT_UH2_UH(tmp2, tmp3, 7); + PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride); + + ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6); + ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7); + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + SAT_UH2_UH(tmp0, tmp1, 7); + PCKEV_ST_SB(tmp0, tmp1, dst + 2 * dst_stride); + + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + SAT_UH2_UH(tmp2, tmp3, 7); + PCKEV_ST_SB(tmp2, tmp3, dst + 3 * dst_stride); + + ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2); + ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + SAT_UH2_UH(tmp0, tmp1, 7); + PCKEV_ST_SB(tmp0, tmp1, dst + 16); + + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + SAT_UH2_UH(tmp2, tmp3, 7); + PCKEV_ST_SB(tmp2, tmp3, dst + 16 + dst_stride); + + ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6); + ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7); + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + SAT_UH2_UH(tmp0, tmp1, 7); + PCKEV_ST_SB(tmp0, tmp1, dst + 16 + 2 * dst_stride); + + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + SAT_UH2_UH(tmp2, tmp3, 7); + PCKEV_ST_SB(tmp2, tmp3, dst + 16 + 3 * dst_stride); dst += (4 * dst_stride); src0 = src4; @@ -661,97 +575,72 @@ static void common_vt_2t_64w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter, int32_t height) { uint32_t loop_cnt; - v16u8 src0, src1, src2, src3, src4, src5, src6, src7; - v16u8 src8, src9, src10, src11; - v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16u8 src11, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - v8u16 filt; + v8i16 filt; /* rearranging filter_y */ - filt = LOAD_UH(filter); - filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + filt = LD_SH(filter); + filt0 = (v16u8)__msa_splati_h(filt, 0); - LOAD_4VECS_UB(src, 16, src0, src3, src6, src9); + LD_UB4(src, 16, src0, src3, src6, src9); src += src_stride; for (loop_cnt = (height >> 1); loop_cnt--;) { - LOAD_2VECS_UB(src, src_stride, src1, src2); - LOAD_2VECS_UB(src + 16, src_stride, src4, src5); - LOAD_2VECS_UB(src + 32, src_stride, src7, src8); - LOAD_2VECS_UB(src + 48, src_stride, src10, src11); + LD_UB2(src, src_stride, src1, src2); + LD_UB2(src + 16, src_stride, src4, src5); + LD_UB2(src + 32, src_stride, src7, src8); + LD_UB2(src + 48, src_stride, src10, src11); src += (2 * 
src_stride); - ILV_B_LRLR_UB(src0, src1, src1, src2, vec1, vec0, vec3, vec2); - - tmp0 = __msa_dotp_u_h(vec0, filt0); - tmp1 = __msa_dotp_u_h(vec1, filt0); - - tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7); - tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7); - - PCKEV_B_STORE_VEC(tmp1, tmp0, dst); - - tmp2 = __msa_dotp_u_h(vec2, filt0); - tmp3 = __msa_dotp_u_h(vec3, filt0); - - tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7); - tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7); - - PCKEV_B_STORE_VEC(tmp3, tmp2, dst + dst_stride); - - ILV_B_LRLR_UB(src3, src4, src4, src5, vec5, vec4, vec7, vec6); - - tmp4 = __msa_dotp_u_h(vec4, filt0); - tmp5 = __msa_dotp_u_h(vec5, filt0); - - tmp4 = SRARI_SATURATE_UNSIGNED_H(tmp4, FILTER_BITS, 7); - tmp5 = SRARI_SATURATE_UNSIGNED_H(tmp5, FILTER_BITS, 7); - - PCKEV_B_STORE_VEC(tmp5, tmp4, dst + 16); - - tmp6 = __msa_dotp_u_h(vec6, filt0); - tmp7 = __msa_dotp_u_h(vec7, filt0); - - tmp6 = SRARI_SATURATE_UNSIGNED_H(tmp6, FILTER_BITS, 7); - tmp7 = SRARI_SATURATE_UNSIGNED_H(tmp7, FILTER_BITS, 7); - - PCKEV_B_STORE_VEC(tmp7, tmp6, dst + 16 + dst_stride); - - ILV_B_LRLR_UB(src6, src7, src7, src8, vec1, vec0, vec3, vec2); - - tmp0 = __msa_dotp_u_h(vec0, filt0); - tmp1 = __msa_dotp_u_h(vec1, filt0); - - tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7); - tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7); - - PCKEV_B_STORE_VEC(tmp1, tmp0, dst + 32); - - tmp2 = __msa_dotp_u_h(vec2, filt0); - tmp3 = __msa_dotp_u_h(vec3, filt0); - - tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7); - tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7); - - PCKEV_B_STORE_VEC(tmp3, tmp2, dst + 32 + dst_stride); - - ILV_B_LRLR_UB(src9, src10, src10, src11, vec5, vec4, vec7, vec6); - - tmp4 = __msa_dotp_u_h(vec4, filt0); - tmp5 = __msa_dotp_u_h(vec5, filt0); - - tmp4 = SRARI_SATURATE_UNSIGNED_H(tmp4, FILTER_BITS, 7); - tmp5 = SRARI_SATURATE_UNSIGNED_H(tmp5, FILTER_BITS, 7); - - PCKEV_B_STORE_VEC(tmp5, tmp4, dst + 48); - - tmp6 = __msa_dotp_u_h(vec6, filt0); - tmp7 = __msa_dotp_u_h(vec7, filt0); - - tmp6 = SRARI_SATURATE_UNSIGNED_H(tmp6, FILTER_BITS, 7); - tmp7 = SRARI_SATURATE_UNSIGNED_H(tmp7, FILTER_BITS, 7); - - PCKEV_B_STORE_VEC(tmp7, tmp6, dst + 48 + dst_stride); + ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); + ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + SAT_UH2_UH(tmp0, tmp1, 7); + PCKEV_ST_SB(tmp0, tmp1, dst); + + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + SAT_UH2_UH(tmp2, tmp3, 7); + PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride); + + ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6); + ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7); + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5); + SRARI_H2_UH(tmp4, tmp5, FILTER_BITS); + SAT_UH2_UH(tmp4, tmp5, 7); + PCKEV_ST_SB(tmp4, tmp5, dst + 16); + + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7); + SRARI_H2_UH(tmp6, tmp7, FILTER_BITS); + SAT_UH2_UH(tmp6, tmp7, 7); + PCKEV_ST_SB(tmp6, tmp7, dst + 16 + dst_stride); + + ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2); + ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + SAT_UH2_UH(tmp0, tmp1, 7); + PCKEV_ST_SB(tmp0, tmp1, dst + 32); + + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + SAT_UH2_UH(tmp2, tmp3, 7); + PCKEV_ST_SB(tmp2, tmp3, dst + 32 + dst_stride); + + 
ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6); + ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7); + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5); + SRARI_H2_UH(tmp4, tmp5, FILTER_BITS); + SAT_UH2_UH(tmp4, tmp5, 7); + PCKEV_ST_SB(tmp4, tmp5, dst + 48); + + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7); + SRARI_H2_UH(tmp6, tmp7, FILTER_BITS); + SAT_UH2_UH(tmp6, tmp7, 7); + PCKEV_ST_SB(tmp6, tmp7, dst + 48 + dst_stride); dst += (2 * dst_stride); src0 = src2; diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve_avg_msa.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve_avg_msa.c index 72b8ab71397..eb8776078b2 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve_avg_msa.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve_avg_msa.c @@ -19,46 +19,35 @@ static void avg_width4_msa(const uint8_t *src, int32_t src_stride, if (0 == (height % 4)) { for (cnt = (height / 4); cnt--;) { - LOAD_4VECS_UB(src, src_stride, src0, src1, src2, src3); + LD_UB4(src, src_stride, src0, src1, src2, src3); src += (4 * src_stride); - LOAD_4VECS_UB(dst, dst_stride, dst0, dst1, dst2, dst3); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - dst0 = __msa_aver_u_b(src0, dst0); - dst1 = __msa_aver_u_b(src1, dst1); - dst2 = __msa_aver_u_b(src2, dst2); - dst3 = __msa_aver_u_b(src3, dst3); + AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, + dst0, dst1, dst2, dst3); out0 = __msa_copy_u_w((v4i32)dst0, 0); out1 = __msa_copy_u_w((v4i32)dst1, 0); out2 = __msa_copy_u_w((v4i32)dst2, 0); out3 = __msa_copy_u_w((v4i32)dst3, 0); - - STORE_WORD(dst, out0); - dst += dst_stride; - STORE_WORD(dst, out1); - dst += dst_stride; - STORE_WORD(dst, out2); - dst += dst_stride; - STORE_WORD(dst, out3); - dst += dst_stride; + SW4(out0, out1, out2, out3, dst, dst_stride); + dst += (4 * dst_stride); } } else if (0 == (height % 2)) { for (cnt = (height / 2); cnt--;) { - LOAD_2VECS_UB(src, src_stride, src0, src1); + LD_UB2(src, src_stride, src0, src1); src += (2 * src_stride); - LOAD_2VECS_UB(dst, dst_stride, dst0, dst1); + LD_UB2(dst, dst_stride, dst0, dst1); - dst0 = __msa_aver_u_b(src0, dst0); - dst1 = __msa_aver_u_b(src1, dst1); + AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1); out0 = __msa_copy_u_w((v4i32)dst0, 0); out1 = __msa_copy_u_w((v4i32)dst1, 0); - - STORE_WORD(dst, out0); + SW(out0, dst); dst += dst_stride; - STORE_WORD(dst, out1); + SW(out1, dst); dst += dst_stride; } } @@ -72,29 +61,19 @@ static void avg_width8_msa(const uint8_t *src, int32_t src_stride, v16u8 dst0, dst1, dst2, dst3; for (cnt = (height / 4); cnt--;) { - LOAD_4VECS_UB(src, src_stride, src0, src1, src2, src3); + LD_UB4(src, src_stride, src0, src1, src2, src3); src += (4 * src_stride); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - LOAD_4VECS_UB(dst, dst_stride, dst0, dst1, dst2, dst3); - - dst0 = __msa_aver_u_b(src0, dst0); - dst1 = __msa_aver_u_b(src1, dst1); - dst2 = __msa_aver_u_b(src2, dst2); - dst3 = __msa_aver_u_b(src3, dst3); + AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, + dst0, dst1, dst2, dst3); out0 = __msa_copy_u_d((v2i64)dst0, 0); out1 = __msa_copy_u_d((v2i64)dst1, 0); out2 = __msa_copy_u_d((v2i64)dst2, 0); out3 = __msa_copy_u_d((v2i64)dst3, 0); - - STORE_DWORD(dst, out0); - dst += dst_stride; - STORE_DWORD(dst, out1); - dst += dst_stride; - STORE_DWORD(dst, out2); - dst += dst_stride; - STORE_DWORD(dst, out3); - dst += dst_stride; + SD4(out0, out1, out2, out3, dst, dst_stride); + dst 
+= (4 * dst_stride); } } @@ -105,24 +84,15 @@ static void avg_width16_msa(const uint8_t *src, int32_t src_stride, v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; for (cnt = (height / 8); cnt--;) { - LOAD_8VECS_UB(src, src_stride, - src0, src1, src2, src3, src4, src5, src6, src7); + LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); src += (8 * src_stride); + LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); - LOAD_8VECS_UB(dst, dst_stride, - dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); - - dst0 = __msa_aver_u_b(src0, dst0); - dst1 = __msa_aver_u_b(src1, dst1); - dst2 = __msa_aver_u_b(src2, dst2); - dst3 = __msa_aver_u_b(src3, dst3); - dst4 = __msa_aver_u_b(src4, dst4); - dst5 = __msa_aver_u_b(src5, dst5); - dst6 = __msa_aver_u_b(src6, dst6); - dst7 = __msa_aver_u_b(src7, dst7); - - STORE_8VECS_UB(dst, dst_stride, - dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); + AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, + dst0, dst1, dst2, dst3); + AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, + dst4, dst5, dst6, dst7); + ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, dst_stride); dst += (8 * dst_stride); } } @@ -137,99 +107,34 @@ static void avg_width32_msa(const uint8_t *src, int32_t src_stride, v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15; for (cnt = (height / 8); cnt--;) { - src0 = LOAD_UB(src); - src1 = LOAD_UB(src + 16); - src += src_stride; - src2 = LOAD_UB(src); - src3 = LOAD_UB(src + 16); - src += src_stride; - src4 = LOAD_UB(src); - src5 = LOAD_UB(src + 16); - src += src_stride; - src6 = LOAD_UB(src); - src7 = LOAD_UB(src + 16); - src += src_stride; - - dst0 = LOAD_UB(dst_dup); - dst1 = LOAD_UB(dst_dup + 16); - dst_dup += dst_stride; - dst2 = LOAD_UB(dst_dup); - dst3 = LOAD_UB(dst_dup + 16); - dst_dup += dst_stride; - dst4 = LOAD_UB(dst_dup); - dst5 = LOAD_UB(dst_dup + 16); - dst_dup += dst_stride; - dst6 = LOAD_UB(dst_dup); - dst7 = LOAD_UB(dst_dup + 16); - dst_dup += dst_stride; - - src8 = LOAD_UB(src); - src9 = LOAD_UB(src + 16); - src += src_stride; - src10 = LOAD_UB(src); - src11 = LOAD_UB(src + 16); - src += src_stride; - src12 = LOAD_UB(src); - src13 = LOAD_UB(src + 16); - src += src_stride; - src14 = LOAD_UB(src); - src15 = LOAD_UB(src + 16); - src += src_stride; - - dst8 = LOAD_UB(dst_dup); - dst9 = LOAD_UB(dst_dup + 16); - dst_dup += dst_stride; - dst10 = LOAD_UB(dst_dup); - dst11 = LOAD_UB(dst_dup + 16); - dst_dup += dst_stride; - dst12 = LOAD_UB(dst_dup); - dst13 = LOAD_UB(dst_dup + 16); - dst_dup += dst_stride; - dst14 = LOAD_UB(dst_dup); - dst15 = LOAD_UB(dst_dup + 16); - dst_dup += dst_stride; - - dst0 = __msa_aver_u_b(src0, dst0); - dst1 = __msa_aver_u_b(src1, dst1); - dst2 = __msa_aver_u_b(src2, dst2); - dst3 = __msa_aver_u_b(src3, dst3); - dst4 = __msa_aver_u_b(src4, dst4); - dst5 = __msa_aver_u_b(src5, dst5); - dst6 = __msa_aver_u_b(src6, dst6); - dst7 = __msa_aver_u_b(src7, dst7); - dst8 = __msa_aver_u_b(src8, dst8); - dst9 = __msa_aver_u_b(src9, dst9); - dst10 = __msa_aver_u_b(src10, dst10); - dst11 = __msa_aver_u_b(src11, dst11); - dst12 = __msa_aver_u_b(src12, dst12); - dst13 = __msa_aver_u_b(src13, dst13); - dst14 = __msa_aver_u_b(src14, dst14); - dst15 = __msa_aver_u_b(src15, dst15); - - STORE_UB(dst0, dst); - STORE_UB(dst1, dst + 16); - dst += dst_stride; - STORE_UB(dst2, dst); - STORE_UB(dst3, dst + 16); - dst += dst_stride; - STORE_UB(dst4, dst); - STORE_UB(dst5, dst + 16); - dst += dst_stride; - STORE_UB(dst6, dst); - STORE_UB(dst7, dst + 16); - dst += 
dst_stride; - STORE_UB(dst8, dst); - STORE_UB(dst9, dst + 16); - dst += dst_stride; - STORE_UB(dst10, dst); - STORE_UB(dst11, dst + 16); - dst += dst_stride; - STORE_UB(dst12, dst); - STORE_UB(dst13, dst + 16); - dst += dst_stride; - STORE_UB(dst14, dst); - STORE_UB(dst15, dst + 16); - dst += dst_stride; + LD_UB4(src, src_stride, src0, src2, src4, src6); + LD_UB4(src + 16, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + LD_UB4(dst_dup, dst_stride, dst0, dst2, dst4, dst6); + LD_UB4(dst_dup + 16, dst_stride, dst1, dst3, dst5, dst7); + dst_dup += (4 * dst_stride); + LD_UB4(src, src_stride, src8, src10, src12, src14); + LD_UB4(src + 16, src_stride, src9, src11, src13, src15); + src += (4 * src_stride); + LD_UB4(dst_dup, dst_stride, dst8, dst10, dst12, dst14); + LD_UB4(dst_dup + 16, dst_stride, dst9, dst11, dst13, dst15); + dst_dup += (4 * dst_stride); + + AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, + dst0, dst1, dst2, dst3); + AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, + dst4, dst5, dst6, dst7); + AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11, + dst8, dst9, dst10, dst11); + AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15, + dst12, dst13, dst14, dst15); + + ST_UB4(dst0, dst2, dst4, dst6, dst, dst_stride); + ST_UB4(dst1, dst3, dst5, dst7, dst + 16, dst_stride); + dst += (4 * dst_stride); + ST_UB4(dst8, dst10, dst12, dst14, dst, dst_stride); + ST_UB4(dst9, dst11, dst13, dst15, dst + 16, dst_stride); + dst += (4 * dst_stride); } } @@ -243,48 +148,40 @@ static void avg_width64_msa(const uint8_t *src, int32_t src_stride, v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15; for (cnt = (height / 4); cnt--;) { - LOAD_4VECS_UB(src, 16, src0, src1, src2, src3); + LD_UB4(src, 16, src0, src1, src2, src3); src += src_stride; - LOAD_4VECS_UB(src, 16, src4, src5, src6, src7); + LD_UB4(src, 16, src4, src5, src6, src7); src += src_stride; - LOAD_4VECS_UB(src, 16, src8, src9, src10, src11); + LD_UB4(src, 16, src8, src9, src10, src11); src += src_stride; - LOAD_4VECS_UB(src, 16, src12, src13, src14, src15); + LD_UB4(src, 16, src12, src13, src14, src15); src += src_stride; - LOAD_4VECS_UB(dst_dup, 16, dst0, dst1, dst2, dst3); + LD_UB4(dst_dup, 16, dst0, dst1, dst2, dst3); dst_dup += dst_stride; - LOAD_4VECS_UB(dst_dup, 16, dst4, dst5, dst6, dst7); + LD_UB4(dst_dup, 16, dst4, dst5, dst6, dst7); dst_dup += dst_stride; - LOAD_4VECS_UB(dst_dup, 16, dst8, dst9, dst10, dst11); + LD_UB4(dst_dup, 16, dst8, dst9, dst10, dst11); dst_dup += dst_stride; - LOAD_4VECS_UB(dst_dup, 16, dst12, dst13, dst14, dst15); + LD_UB4(dst_dup, 16, dst12, dst13, dst14, dst15); dst_dup += dst_stride; - dst0 = __msa_aver_u_b(src0, dst0); - dst1 = __msa_aver_u_b(src1, dst1); - dst2 = __msa_aver_u_b(src2, dst2); - dst3 = __msa_aver_u_b(src3, dst3); - dst4 = __msa_aver_u_b(src4, dst4); - dst5 = __msa_aver_u_b(src5, dst5); - dst6 = __msa_aver_u_b(src6, dst6); - dst7 = __msa_aver_u_b(src7, dst7); - dst8 = __msa_aver_u_b(src8, dst8); - dst9 = __msa_aver_u_b(src9, dst9); - dst10 = __msa_aver_u_b(src10, dst10); - dst11 = __msa_aver_u_b(src11, dst11); - dst12 = __msa_aver_u_b(src12, dst12); - dst13 = __msa_aver_u_b(src13, dst13); - dst14 = __msa_aver_u_b(src14, dst14); - dst15 = __msa_aver_u_b(src15, dst15); + AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, + dst0, dst1, dst2, dst3); + AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, + dst4, dst5, dst6, dst7); + AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11, + dst8, dst9, 
dst10, dst11); + AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15, + dst12, dst13, dst14, dst15); - STORE_4VECS_UB(dst, 16, dst0, dst1, dst2, dst3); + ST_UB4(dst0, dst1, dst2, dst3, dst, 16); dst += dst_stride; - STORE_4VECS_UB(dst, 16, dst4, dst5, dst6, dst7); + ST_UB4(dst4, dst5, dst6, dst7, dst, 16); dst += dst_stride; - STORE_4VECS_UB(dst, 16, dst8, dst9, dst10, dst11); + ST_UB4(dst8, dst9, dst10, dst11, dst, 16); dst += dst_stride; - STORE_4VECS_UB(dst, 16, dst12, dst13, dst14, dst15); + ST_UB4(dst12, dst13, dst14, dst15, dst, 16); dst += dst_stride; } } diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve_copy_msa.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve_copy_msa.c index 064ba762fa0..7a292c5ce16 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve_copy_msa.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve_copy_msa.c @@ -12,16 +12,14 @@ #include "vp9/common/mips/msa/vp9_macros_msa.h" static void copy_width8_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int32_t height) { + uint8_t *dst, int32_t dst_stride, int32_t height) { int32_t cnt; uint64_t out0, out1, out2, out3, out4, out5, out6, out7; v16u8 src0, src1, src2, src3, src4, src5, src6, src7; if (0 == height % 12) { for (cnt = (height / 12); cnt--;) { - LOAD_8VECS_UB(src, src_stride, - src0, src1, src2, src3, src4, src5, src6, src7); + LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); src += (8 * src_stride); out0 = __msa_copy_u_d((v2i64)src0, 0); @@ -33,44 +31,24 @@ static void copy_width8_msa(const uint8_t *src, int32_t src_stride, out6 = __msa_copy_u_d((v2i64)src6, 0); out7 = __msa_copy_u_d((v2i64)src7, 0); - STORE_DWORD(dst, out0); - dst += dst_stride; - STORE_DWORD(dst, out1); - dst += dst_stride; - STORE_DWORD(dst, out2); - dst += dst_stride; - STORE_DWORD(dst, out3); - dst += dst_stride; - STORE_DWORD(dst, out4); - dst += dst_stride; - STORE_DWORD(dst, out5); - dst += dst_stride; - STORE_DWORD(dst, out6); - dst += dst_stride; - STORE_DWORD(dst, out7); - dst += dst_stride; + SD4(out0, out1, out2, out3, dst, dst_stride); + dst += (4 * dst_stride); + SD4(out4, out5, out6, out7, dst, dst_stride); + dst += (4 * dst_stride); - LOAD_4VECS_UB(src, src_stride, src0, src1, src2, src3); + LD_UB4(src, src_stride, src0, src1, src2, src3); src += (4 * src_stride); out0 = __msa_copy_u_d((v2i64)src0, 0); out1 = __msa_copy_u_d((v2i64)src1, 0); out2 = __msa_copy_u_d((v2i64)src2, 0); out3 = __msa_copy_u_d((v2i64)src3, 0); - - STORE_DWORD(dst, out0); - dst += dst_stride; - STORE_DWORD(dst, out1); - dst += dst_stride; - STORE_DWORD(dst, out2); - dst += dst_stride; - STORE_DWORD(dst, out3); - dst += dst_stride; + SD4(out0, out1, out2, out3, dst, dst_stride); + dst += (4 * dst_stride); } } else if (0 == height % 8) { for (cnt = height >> 3; cnt--;) { - LOAD_8VECS_UB(src, src_stride, - src0, src1, src2, src3, src4, src5, src6, src7); + LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); src += (8 * src_stride); out0 = __msa_copy_u_d((v2i64)src0, 0); @@ -82,53 +60,33 @@ static void copy_width8_msa(const uint8_t *src, int32_t src_stride, out6 = __msa_copy_u_d((v2i64)src6, 0); out7 = __msa_copy_u_d((v2i64)src7, 0); - STORE_DWORD(dst, out0); - dst += dst_stride; - STORE_DWORD(dst, out1); - dst += dst_stride; - STORE_DWORD(dst, out2); - dst += dst_stride; - STORE_DWORD(dst, out3); - dst += dst_stride; - STORE_DWORD(dst, 
out4); - dst += dst_stride; - STORE_DWORD(dst, out5); - dst += dst_stride; - STORE_DWORD(dst, out6); - dst += dst_stride; - STORE_DWORD(dst, out7); - dst += dst_stride; + SD4(out0, out1, out2, out3, dst, dst_stride); + dst += (4 * dst_stride); + SD4(out4, out5, out6, out7, dst, dst_stride); + dst += (4 * dst_stride); } } else if (0 == height % 4) { for (cnt = (height / 4); cnt--;) { - LOAD_4VECS_UB(src, src_stride, src0, src1, src2, src3); + LD_UB4(src, src_stride, src0, src1, src2, src3); src += (4 * src_stride); - out0 = __msa_copy_u_d((v2i64)src0, 0); out1 = __msa_copy_u_d((v2i64)src1, 0); out2 = __msa_copy_u_d((v2i64)src2, 0); out3 = __msa_copy_u_d((v2i64)src3, 0); - STORE_DWORD(dst, out0); - dst += dst_stride; - STORE_DWORD(dst, out1); - dst += dst_stride; - STORE_DWORD(dst, out2); - dst += dst_stride; - STORE_DWORD(dst, out3); - dst += dst_stride; + SD4(out0, out1, out2, out3, dst, dst_stride); + dst += (4 * dst_stride); } } else if (0 == height % 2) { for (cnt = (height / 2); cnt--;) { - LOAD_2VECS_UB(src, src_stride, src0, src1); + LD_UB2(src, src_stride, src0, src1); src += (2 * src_stride); - out0 = __msa_copy_u_d((v2i64)src0, 0); out1 = __msa_copy_u_d((v2i64)src1, 0); - STORE_DWORD(dst, out0); + SD(out0, dst); dst += dst_stride; - STORE_DWORD(dst, out1); + SD(out1, dst); dst += dst_stride; } } @@ -147,12 +105,12 @@ static void copy_16multx8mult_msa(const uint8_t *src, int32_t src_stride, dst_tmp = dst; for (loop_cnt = (height >> 3); loop_cnt--;) { - LOAD_8VECS_UB(src_tmp, src_stride, - src0, src1, src2, src3, src4, src5, src6, src7); + LD_UB8(src_tmp, src_stride, + src0, src1, src2, src3, src4, src5, src6, src7); src_tmp += (8 * src_stride); - STORE_8VECS_UB(dst_tmp, dst_stride, - src0, src1, src2, src3, src4, src5, src6, src7); + ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, + dst_tmp, dst_stride); dst_tmp += (8 * dst_stride); } @@ -162,90 +120,79 @@ static void copy_16multx8mult_msa(const uint8_t *src, int32_t src_stride, } static void copy_width16_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int32_t height) { + uint8_t *dst, int32_t dst_stride, int32_t height) { int32_t cnt; v16u8 src0, src1, src2, src3, src4, src5, src6, src7; if (0 == height % 12) { for (cnt = (height / 12); cnt--;) { - LOAD_8VECS_UB(src, src_stride, - src0, src1, src2, src3, src4, src5, src6, src7); + LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); src += (8 * src_stride); - - STORE_8VECS_UB(dst, dst_stride, - src0, src1, src2, src3, src4, src5, src6, src7); + ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride); dst += (8 * dst_stride); - LOAD_4VECS_UB(src, src_stride, src0, src1, src2, src3); + LD_UB4(src, src_stride, src0, src1, src2, src3); src += (4 * src_stride); - - STORE_4VECS_UB(dst, dst_stride, src0, src1, src2, src3); + ST_UB4(src0, src1, src2, src3, dst, dst_stride); dst += (4 * dst_stride); } } else if (0 == height % 8) { copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16); } else if (0 == height % 4) { for (cnt = (height >> 2); cnt--;) { - LOAD_4VECS_UB(src, src_stride, src0, src1, src2, src3); + LD_UB4(src, src_stride, src0, src1, src2, src3); src += (4 * src_stride); - STORE_4VECS_UB(dst, dst_stride, src0, src1, src2, src3); + ST_UB4(src0, src1, src2, src3, dst, dst_stride); dst += (4 * dst_stride); } } } static void copy_width32_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int32_t height) { + uint8_t *dst, int32_t dst_stride, int32_t height) { 
int32_t cnt; v16u8 src0, src1, src2, src3, src4, src5, src6, src7; if (0 == height % 12) { for (cnt = (height / 12); cnt--;) { - LOAD_4VECS_UB(src, src_stride, src0, src1, src2, src3); - LOAD_4VECS_UB(src + 16, src_stride, src4, src5, src6, src7); + LD_UB4(src, src_stride, src0, src1, src2, src3); + LD_UB4(src + 16, src_stride, src4, src5, src6, src7); src += (4 * src_stride); - - STORE_4VECS_UB(dst, dst_stride, src0, src1, src2, src3); - STORE_4VECS_UB(dst + 16, dst_stride, src4, src5, src6, src7); + ST_UB4(src0, src1, src2, src3, dst, dst_stride); + ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride); dst += (4 * dst_stride); - LOAD_4VECS_UB(src, src_stride, src0, src1, src2, src3); - LOAD_4VECS_UB(src + 16, src_stride, src4, src5, src6, src7); + LD_UB4(src, src_stride, src0, src1, src2, src3); + LD_UB4(src + 16, src_stride, src4, src5, src6, src7); src += (4 * src_stride); - - STORE_4VECS_UB(dst, dst_stride, src0, src1, src2, src3); - STORE_4VECS_UB(dst + 16, dst_stride, src4, src5, src6, src7); + ST_UB4(src0, src1, src2, src3, dst, dst_stride); + ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride); dst += (4 * dst_stride); - LOAD_4VECS_UB(src, src_stride, src0, src1, src2, src3); - LOAD_4VECS_UB(src + 16, src_stride, src4, src5, src6, src7); + LD_UB4(src, src_stride, src0, src1, src2, src3); + LD_UB4(src + 16, src_stride, src4, src5, src6, src7); src += (4 * src_stride); - - STORE_4VECS_UB(dst, dst_stride, src0, src1, src2, src3); - STORE_4VECS_UB(dst + 16, dst_stride, src4, src5, src6, src7); + ST_UB4(src0, src1, src2, src3, dst, dst_stride); + ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride); dst += (4 * dst_stride); } } else if (0 == height % 8) { copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 32); } else if (0 == height % 4) { for (cnt = (height >> 2); cnt--;) { - LOAD_4VECS_UB(src, src_stride, src0, src1, src2, src3); - LOAD_4VECS_UB(src + 16, src_stride, src4, src5, src6, src7); + LD_UB4(src, src_stride, src0, src1, src2, src3); + LD_UB4(src + 16, src_stride, src4, src5, src6, src7); src += (4 * src_stride); - - STORE_4VECS_UB(dst, dst_stride, src0, src1, src2, src3); - STORE_4VECS_UB(dst + 16, dst_stride, src4, src5, src6, src7); + ST_UB4(src0, src1, src2, src3, dst, dst_stride); + ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride); dst += (4 * dst_stride); } } } static void copy_width64_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int32_t height) { + uint8_t *dst, int32_t dst_stride, int32_t height) { copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 64); } @@ -264,8 +211,8 @@ void vp9_convolve_copy_msa(const uint8_t *src, ptrdiff_t src_stride, uint32_t cnt, tmp; /* 1 word storage */ for (cnt = h; cnt--;) { - tmp = LOAD_WORD(src); - STORE_WORD(dst, tmp); + tmp = LW(src); + SW(tmp, dst); src += src_stride; dst += dst_stride; } diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve_msa.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve_msa.h index b109a4014f8..40fe94d3be5 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve_msa.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve_msa.h @@ -16,142 +16,104 @@ extern const uint8_t mc_filt_mask_arr[16 * 3]; -#define HORIZ_8TAP_FILT(src, mask0, mask1, mask2, mask3, \ - filt_h0, filt_h1, filt_h2, filt_h3) ({ \ - v8i16 vec0, vec1, vec2, vec3, horiz_out; \ - \ - vec0 = (v8i16)__msa_vshf_b((v16i8)(mask0), (v16i8)(src), (v16i8)(src)); \ - 
vec0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)(filt_h0)); \ - vec1 = (v8i16)__msa_vshf_b((v16i8)(mask1), (v16i8)(src), (v16i8)(src)); \ - vec0 = __msa_dpadd_s_h(vec0, (v16i8)(filt_h1), (v16i8)vec1); \ - vec2 = (v8i16)__msa_vshf_b((v16i8)(mask2), (v16i8)(src), (v16i8)(src)); \ - vec2 = __msa_dotp_s_h((v16i8)vec2, (v16i8)(filt_h2)); \ - vec3 = (v8i16)__msa_vshf_b((v16i8)(mask3), (v16i8)(src), (v16i8)(src)); \ - vec2 = __msa_dpadd_s_h(vec2, (v16i8)(filt_h3), (v16i8)vec3); \ - vec0 = __msa_adds_s_h(vec0, vec2); \ - horiz_out = SRARI_SATURATE_SIGNED_H(vec0, FILTER_BITS, 7); \ - \ - horiz_out; \ +#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, \ + filt0, filt1, filt2, filt3) ({ \ + v8i16 tmp0, tmp1; \ + \ + tmp0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)filt0); \ + tmp0 = __msa_dpadd_s_h(tmp0, (v16i8)vec1, (v16i8)filt1); \ + tmp1 = __msa_dotp_s_h((v16i8)vec2, (v16i8)filt2); \ + tmp1 = __msa_dpadd_s_h(tmp1, (v16i8)vec3, (v16i8)filt3); \ + tmp0 = __msa_adds_s_h(tmp0, tmp1); \ + \ + tmp0; \ }) -#define HORIZ_8TAP_FILT_2VECS(src0, src1, mask0, mask1, mask2, mask3, \ - filt_h0, filt_h1, filt_h2, filt_h3) ({ \ - v8i16 vec0, vec1, vec2, vec3, horiz_out; \ - \ - vec0 = (v8i16)__msa_vshf_b((v16i8)(mask0), (v16i8)(src1), (v16i8)(src0)); \ - vec0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)(filt_h0)); \ - vec1 = (v8i16)__msa_vshf_b((v16i8)(mask1), (v16i8)(src1), (v16i8)(src0)); \ - vec0 = __msa_dpadd_s_h(vec0, (v16i8)(filt_h1), (v16i8)vec1); \ - vec2 = (v8i16)__msa_vshf_b((v16i8)(mask2), (v16i8)(src1), (v16i8)(src0)); \ - vec2 = __msa_dotp_s_h((v16i8)vec2, (v16i8)(filt_h2)); \ - vec3 = (v8i16)__msa_vshf_b((v16i8)(mask3), (v16i8)(src1), (v16i8)(src0)); \ - vec2 = __msa_dpadd_s_h(vec2, ((v16i8)filt_h3), (v16i8)vec3); \ - vec0 = __msa_adds_s_h(vec0, vec2); \ - horiz_out = (v8i16)SRARI_SATURATE_SIGNED_H(vec0, FILTER_BITS, 7); \ - \ - horiz_out; \ +#define HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, \ + filt_h0, filt_h1, filt_h2, filt_h3) ({ \ + v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \ + v8i16 hz_out_m; \ + \ + VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3, \ + vec0_m, vec1_m, vec2_m, vec3_m); \ + hz_out_m = FILT_8TAP_DPADD_S_H(vec0_m, vec1_m, vec2_m, vec3_m, \ + filt_h0, filt_h1, filt_h2, filt_h3); \ + \ + hz_out_m = __msa_srari_h(hz_out_m, FILTER_BITS); \ + hz_out_m = __msa_sat_s_h(hz_out_m, 7); \ + \ + hz_out_m; \ }) -#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, \ - filt0, filt1, filt2, filt3) ({ \ - v8i16 tmp0, tmp1; \ - \ - tmp0 = __msa_dotp_s_h((v16i8)(vec0), (v16i8)(filt0)); \ - tmp0 = __msa_dpadd_s_h(tmp0, (v16i8)(vec1), (v16i8)(filt1)); \ - tmp1 = __msa_dotp_s_h((v16i8)(vec2), (v16i8)(filt2)); \ - tmp1 = __msa_dpadd_s_h(tmp1, (v16i8)(vec3), ((v16i8)filt3)); \ - tmp0 = __msa_adds_s_h(tmp0, tmp1); \ - \ - tmp0; \ -}) +#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, \ + mask0, mask1, mask2, mask3, \ + filt0, filt1, filt2, filt3, \ + out0, out1) { \ + v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ + v8i16 res0_m, res1_m, res2_m, res3_m; \ + \ + VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \ + DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, res0_m, res1_m); \ + VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \ + DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, res0_m, res1_m); \ + VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m); \ + DOTP_SB2_SH(vec4_m, vec5_m, filt2, filt2, res2_m, res3_m); \ + VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m); \ + DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, res2_m, 
res3_m); \ + ADDS_SH2_SH(res0_m, res2_m, res1_m, res3_m, out0, out1); \ +} + +#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, \ + mask0, mask1, mask2, mask3, \ + filt0, filt1, filt2, filt3, \ + out0, out1, out2, out3) { \ + v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ + v8i16 res0_m, res1_m, res2_m, res3_m, res4_m, res5_m, res6_m, res7_m; \ + \ + VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \ + DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \ + res0_m, res1_m, res2_m, res3_m); \ + VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m); \ + DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2, \ + res4_m, res5_m, res6_m, res7_m); \ + VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m); \ + DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1, \ + res0_m, res1_m, res2_m, res3_m); \ + VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m); \ + DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3, \ + res4_m, res5_m, res6_m, res7_m); \ + ADDS_SH4_SH(res0_m, res4_m, res1_m, res5_m, res2_m, res6_m, res3_m, \ + res7_m, out0, out1, out2, out3); \ +} + +#define PCKEV_XORI128_AVG_ST_UB(in0, in1, dst, pdst) { \ + v16u8 tmp_m; \ + \ + tmp_m = PCKEV_XORI128_UB(in1, in0); \ + tmp_m = __msa_aver_u_b(tmp_m, (v16u8)dst); \ + ST_UB(tmp_m, (pdst)); \ +} -#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, \ - mask0, mask1, mask2, mask3, \ - filt0, filt1, filt2, filt3, \ - out0, out1) { \ - v8i16 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ - v8i16 res0_m, res1_m, res2_m, res3_m; \ - \ - vec0_m = (v8i16)__msa_vshf_b((v16i8)(mask0), (v16i8)(src1), (v16i8)(src0)); \ - vec1_m = (v8i16)__msa_vshf_b((v16i8)(mask0), (v16i8)(src3), (v16i8)(src2)); \ - \ - res0_m = __msa_dotp_s_h((v16i8)vec0_m, (v16i8)(filt0)); \ - res1_m = __msa_dotp_s_h((v16i8)vec1_m, (v16i8)(filt0)); \ - \ - vec2_m = (v8i16)__msa_vshf_b((v16i8)(mask1), (v16i8)(src1), (v16i8)(src0)); \ - vec3_m = (v8i16)__msa_vshf_b((v16i8)(mask1), (v16i8)(src3), (v16i8)(src2)); \ - \ - res0_m = __msa_dpadd_s_h(res0_m, (filt1), (v16i8)vec2_m); \ - res1_m = __msa_dpadd_s_h(res1_m, (filt1), (v16i8)vec3_m); \ - \ - vec4_m = (v8i16)__msa_vshf_b((v16i8)(mask2), (v16i8)(src1), (v16i8)(src0)); \ - vec5_m = (v8i16)__msa_vshf_b((v16i8)(mask2), (v16i8)(src3), (v16i8)(src2)); \ - \ - res2_m = __msa_dotp_s_h((v16i8)(filt2), (v16i8)vec4_m); \ - res3_m = __msa_dotp_s_h((v16i8)(filt2), (v16i8)vec5_m); \ - \ - vec6_m = (v8i16)__msa_vshf_b((v16i8)(mask3), (v16i8)(src1), (v16i8)(src0)); \ - vec7_m = (v8i16)__msa_vshf_b((v16i8)(mask3), (v16i8)(src3), (v16i8)(src2)); \ - \ - res2_m = __msa_dpadd_s_h(res2_m, (v16i8)(filt3), (v16i8)vec6_m); \ - res3_m = __msa_dpadd_s_h(res3_m, (v16i8)(filt3), (v16i8)vec7_m); \ - \ - out0 = __msa_adds_s_h(res0_m, res2_m); \ - out1 = __msa_adds_s_h(res1_m, res3_m); \ +#define PCKEV_AVG_ST_UB(in0, in1, dst, pdst) { \ + v16u8 tmp_m; \ + \ + tmp_m = (v16u8)__msa_pckev_b((v16i8)in0, (v16i8)in1); \ + tmp_m = __msa_aver_u_b(tmp_m, (v16u8)dst); \ + ST_UB(tmp_m, (pdst)); \ } -#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, \ - mask0, mask1, mask2, mask3, \ - filt0, filt1, filt2, 
filt3, \ - out0, out1, out2, out3) { \ - v8i16 vec0_m, vec1_m, vec2_m, vec3_m; \ - v8i16 vec4_m, vec5_m, vec6_m, vec7_m; \ - v8i16 res0_m, res1_m, res2_m, res3_m; \ - v8i16 res4_m, res5_m, res6_m, res7_m; \ - \ - vec0_m = (v8i16)__msa_vshf_b((v16i8)(mask0), (v16i8)(src0), (v16i8)(src0)); \ - vec1_m = (v8i16)__msa_vshf_b((v16i8)(mask0), (v16i8)(src1), (v16i8)(src1)); \ - vec2_m = (v8i16)__msa_vshf_b((v16i8)(mask0), (v16i8)(src2), (v16i8)(src2)); \ - vec3_m = (v8i16)__msa_vshf_b((v16i8)(mask0), (v16i8)(src3), (v16i8)(src3)); \ - \ - res0_m = __msa_dotp_s_h((v16i8)vec0_m, (v16i8)(filt0)); \ - res1_m = __msa_dotp_s_h((v16i8)vec1_m, (v16i8)(filt0)); \ - res2_m = __msa_dotp_s_h((v16i8)vec2_m, (v16i8)(filt0)); \ - res3_m = __msa_dotp_s_h((v16i8)vec3_m, (v16i8)(filt0)); \ - \ - vec0_m = (v8i16)__msa_vshf_b((v16i8)(mask2), (v16i8)(src0), (v16i8)(src0)); \ - vec1_m = (v8i16)__msa_vshf_b((v16i8)(mask2), (v16i8)(src1), (v16i8)(src1)); \ - vec2_m = (v8i16)__msa_vshf_b((v16i8)(mask2), (v16i8)(src2), (v16i8)(src2)); \ - vec3_m = (v8i16)__msa_vshf_b((v16i8)(mask2), (v16i8)(src3), (v16i8)(src3)); \ - \ - res4_m = __msa_dotp_s_h((v16i8)vec0_m, (v16i8)(filt2)); \ - res5_m = __msa_dotp_s_h((v16i8)vec1_m, (v16i8)(filt2)); \ - res6_m = __msa_dotp_s_h((v16i8)vec2_m, (v16i8)(filt2)); \ - res7_m = __msa_dotp_s_h((v16i8)vec3_m, (v16i8)(filt2)); \ - \ - vec4_m = (v8i16)__msa_vshf_b((v16i8)(mask1), (v16i8)(src0), (v16i8)(src0)); \ - vec5_m = (v8i16)__msa_vshf_b((v16i8)(mask1), (v16i8)(src1), (v16i8)(src1)); \ - vec6_m = (v8i16)__msa_vshf_b((v16i8)(mask1), (v16i8)(src2), (v16i8)(src2)); \ - vec7_m = (v8i16)__msa_vshf_b((v16i8)(mask1), (v16i8)(src3), (v16i8)(src3)); \ - \ - res0_m = __msa_dpadd_s_h(res0_m, (v16i8)(filt1), (v16i8)vec4_m); \ - res1_m = __msa_dpadd_s_h(res1_m, (v16i8)(filt1), (v16i8)vec5_m); \ - res2_m = __msa_dpadd_s_h(res2_m, (v16i8)(filt1), (v16i8)vec6_m); \ - res3_m = __msa_dpadd_s_h(res3_m, (v16i8)(filt1), (v16i8)vec7_m); \ - \ - vec4_m = (v8i16)__msa_vshf_b((v16i8)(mask3), (v16i8)(src0), (v16i8)(src0)); \ - vec5_m = (v8i16)__msa_vshf_b((v16i8)(mask3), (v16i8)(src1), (v16i8)(src1)); \ - vec6_m = (v8i16)__msa_vshf_b((v16i8)(mask3), (v16i8)(src2), (v16i8)(src2)); \ - vec7_m = (v8i16)__msa_vshf_b((v16i8)(mask3), (v16i8)(src3), (v16i8)(src3)); \ - \ - res4_m = __msa_dpadd_s_h(res4_m, (v16i8)(filt3), (v16i8)vec4_m); \ - res5_m = __msa_dpadd_s_h(res5_m, (v16i8)(filt3), (v16i8)vec5_m); \ - res6_m = __msa_dpadd_s_h(res6_m, (v16i8)(filt3), (v16i8)vec6_m); \ - res7_m = __msa_dpadd_s_h(res7_m, (v16i8)(filt3), (v16i8)vec7_m); \ - \ - out0 = __msa_adds_s_h(res0_m, res4_m); \ - out1 = __msa_adds_s_h(res1_m, res5_m); \ - out2 = __msa_adds_s_h(res2_m, res6_m); \ - out3 = __msa_adds_s_h(res3_m, res7_m); \ +#define PCKEV_AVG_ST8x4_UB(in1, dst0, in2, dst1, in3, dst2, in4, dst3, \ + pdst, stride) { \ + v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + uint8_t *pdst_m = (uint8_t *)(pdst); \ + \ + PCKEV_B2_UB(in2, in1, in4, in3, tmp0_m, tmp1_m); \ + PCKEV_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \ + AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \ + ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \ } #endif /* VP9_COMMON_MIPS_MSA_VP9_CONVOLVE_MSA_H_ */ diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_idct16x16_msa.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_idct16x16_msa.c index e5c0eaa32f3..dd7ca35d3bd 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_idct16x16_msa.c +++ 
b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_idct16x16_msa.c @@ -9,265 +9,7 @@ */ #include <assert.h> - -#include "vpx_ports/mem.h" -#include "vp9/common/vp9_idct.h" -#include "vp9/common/mips/msa/vp9_macros_msa.h" - -#define SET_COSPI_PAIR(c0_h, c1_h) ({ \ - v8i16 out0, r0_m, r1_m; \ - \ - r0_m = __msa_fill_h(c0_h); \ - r1_m = __msa_fill_h(c1_h); \ - out0 = __msa_ilvev_h(r1_m, r0_m); \ - \ - out0; \ -}) - -#define DOTP_CONST_PAIR(reg0, reg1, const0, const1, out0, out1) { \ - v8i16 k0_m = __msa_fill_h(const0); \ - v8i16 s0_m, s1_m, s2_m, s3_m; \ - \ - s0_m = __msa_fill_h(const1); \ - k0_m = __msa_ilvev_h(s0_m, k0_m); \ - \ - s0_m = __msa_ilvl_h(-reg1, reg0); \ - s1_m = __msa_ilvr_h(-reg1, reg0); \ - s2_m = __msa_ilvl_h(reg0, reg1); \ - s3_m = __msa_ilvr_h(reg0, reg1); \ - s1_m = (v8i16)__msa_dotp_s_w(s1_m, k0_m); \ - s0_m = (v8i16)__msa_dotp_s_w(s0_m, k0_m); \ - s1_m = (v8i16)__msa_srari_w((v4i32)s1_m, DCT_CONST_BITS); \ - s0_m = (v8i16)__msa_srari_w((v4i32)s0_m, DCT_CONST_BITS); \ - out0 = __msa_pckev_h(s0_m, s1_m); \ - \ - s1_m = (v8i16)__msa_dotp_s_w(s3_m, k0_m); \ - s0_m = (v8i16)__msa_dotp_s_w(s2_m, k0_m); \ - s1_m = (v8i16)__msa_srari_w((v4i32)s1_m, DCT_CONST_BITS); \ - s0_m = (v8i16)__msa_srari_w((v4i32)s0_m, DCT_CONST_BITS); \ - out1 = __msa_pckev_h(s0_m, s1_m); \ -} - -#define VP9_MADD_SHORT(m0, m1, c0, c1, res0, res1) { \ - v4i32 madd0_m, madd1_m, madd2_m, madd3_m; \ - v8i16 madd_s0_m, madd_s1_m; \ - \ - ILV_H_LR_SH(m0, m1, madd_s1_m, madd_s0_m); \ - \ - DOTP_S_W_4VECS_SW(madd_s0_m, c0, madd_s1_m, c0, \ - madd_s0_m, c1, madd_s1_m, c1, \ - madd0_m, madd1_m, madd2_m, madd3_m); \ - \ - SRARI_W_4VECS_SW(madd0_m, madd1_m, madd2_m, madd3_m, \ - madd0_m, madd1_m, madd2_m, madd3_m, \ - DCT_CONST_BITS); \ - \ - PCKEV_H_2VECS_SH(madd1_m, madd0_m, madd3_m, madd2_m, \ - res0, res1); \ -} - -#define VP9_MADD_BF(inp0, inp1, inp2, inp3, \ - cst0, cst1, cst2, cst3, \ - out0, out1, out2, out3) { \ - v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m; \ - v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - v4i32 m4_m, m5_m; \ - \ - ILV_H_LRLR_SH(inp0, inp1, inp2, inp3, \ - madd_s1_m, madd_s0_m, madd_s3_m, madd_s2_m); \ - \ - DOTP_S_W_4VECS_SW(madd_s0_m, cst0, madd_s1_m, cst0, \ - madd_s2_m, cst2, madd_s3_m, cst2, \ - tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ - \ - m4_m = tmp0_m + tmp2_m; \ - m5_m = tmp1_m + tmp3_m; \ - tmp3_m = tmp1_m - tmp3_m; \ - tmp2_m = tmp0_m - tmp2_m; \ - \ - SRARI_W_4VECS_SW(m4_m, m5_m, tmp2_m, tmp3_m, \ - m4_m, m5_m, tmp2_m, tmp3_m, \ - DCT_CONST_BITS); \ - \ - PCKEV_H_2VECS_SH(m5_m, m4_m, tmp3_m, tmp2_m, out0, out1); \ - \ - DOTP_S_W_4VECS_SW(madd_s0_m, cst1, madd_s1_m, cst1, \ - madd_s2_m, cst3, madd_s3_m, cst3, \ - tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ - \ - m4_m = tmp0_m + tmp2_m; \ - m5_m = tmp1_m + tmp3_m; \ - tmp3_m = tmp1_m - tmp3_m; \ - tmp2_m = tmp0_m - tmp2_m; \ - \ - SRARI_W_4VECS_SW(m4_m, m5_m, tmp2_m, tmp3_m, \ - m4_m, m5_m, tmp2_m, tmp3_m, \ - DCT_CONST_BITS); \ - \ - PCKEV_H_2VECS_SH(m5_m, m4_m, tmp3_m, tmp2_m, out2, out3); \ -} - -#define TRANSPOSE8x8_H1(in0, in1, in2, in3, \ - in4, in5, in6, in7, \ - out0, out1, out2, out3, \ - out4, out5, out6, out7) { \ - v8i16 loc0_m, loc1_m; \ - v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ - \ - loc0_m = __msa_ilvr_h((in6), (in4)); \ - loc1_m = __msa_ilvr_h((in7), (in5)); \ - tmp0_m = __msa_ilvr_h(loc1_m, loc0_m); \ - tmp1_m = __msa_ilvl_h(loc1_m, loc0_m); \ - \ - loc0_m = __msa_ilvl_h((in6), (in4)); \ - loc1_m = __msa_ilvl_h((in7), (in5)); \ - tmp2_m = __msa_ilvr_h(loc1_m, 
loc0_m); \ - tmp3_m = __msa_ilvl_h(loc1_m, loc0_m); \ - \ - loc0_m = __msa_ilvr_h((in2), (in0)); \ - loc1_m = __msa_ilvr_h((in3), (in1)); \ - tmp4_m = __msa_ilvr_h(loc1_m, loc0_m); \ - tmp5_m = __msa_ilvl_h(loc1_m, loc0_m); \ - \ - loc0_m = __msa_ilvl_h((in2), (in0)); \ - loc1_m = __msa_ilvl_h((in3), (in1)); \ - tmp6_m = __msa_ilvr_h(loc1_m, loc0_m); \ - tmp7_m = __msa_ilvl_h(loc1_m, loc0_m); \ - \ - out0 = (v8i16)__msa_pckev_d((v2i64)tmp0_m, (v2i64)tmp4_m); \ - out1 = (v8i16)__msa_pckod_d((v2i64)tmp0_m, (v2i64)tmp4_m); \ - out2 = (v8i16)__msa_pckev_d((v2i64)tmp1_m, (v2i64)tmp5_m); \ - out3 = (v8i16)__msa_pckod_d((v2i64)tmp1_m, (v2i64)tmp5_m); \ - out4 = (v8i16)__msa_pckev_d((v2i64)tmp2_m, (v2i64)tmp6_m); \ - out5 = (v8i16)__msa_pckod_d((v2i64)tmp2_m, (v2i64)tmp6_m); \ - out6 = (v8i16)__msa_pckev_d((v2i64)tmp3_m, (v2i64)tmp7_m); \ - out7 = (v8i16)__msa_pckod_d((v2i64)tmp3_m, (v2i64)tmp7_m); \ -} - -#define VP9_IADST8x16_1D(r0, r1, r2, r3, r4, r5, r6, r7, \ - r8, r9, r10, r11, r12, r13, r14, r15, \ - out0, out1, out2, out3, out4, out5, out6, out7, \ - out8, out9, out10, out11, \ - out12, out13, out14, out15) { \ - v8i16 g0_m, g1_m, g2_m, g3_m, g4_m, g5_m, g6_m, g7_m; \ - v8i16 g8_m, g9_m, g10_m, g11_m, g12_m, g13_m, g14_m, g15_m; \ - v8i16 h0_m, h1_m, h2_m, h3_m, h4_m, h5_m, h6_m, h7_m; \ - v8i16 h8_m, h9_m, h10_m, h11_m; \ - v8i16 k0_m, k1_m, k2_m, k3_m; \ - \ - /* stage 1 */ \ - k0_m = SET_COSPI_PAIR(cospi_1_64, cospi_31_64); \ - k1_m = SET_COSPI_PAIR(cospi_31_64, -cospi_1_64); \ - k2_m = SET_COSPI_PAIR(cospi_17_64, cospi_15_64); \ - k3_m = SET_COSPI_PAIR(cospi_15_64, -cospi_17_64); \ - VP9_MADD_BF(r15, r0, r7, r8, k0_m, k1_m, k2_m, k3_m, \ - g0_m, g1_m, g2_m, g3_m); \ - \ - k0_m = SET_COSPI_PAIR(cospi_5_64, cospi_27_64); \ - k1_m = SET_COSPI_PAIR(cospi_27_64, -cospi_5_64); \ - k2_m = SET_COSPI_PAIR(cospi_21_64, cospi_11_64); \ - k3_m = SET_COSPI_PAIR(cospi_11_64, -cospi_21_64); \ - VP9_MADD_BF(r13, r2, r5, r10, k0_m, k1_m, k2_m, k3_m, \ - g4_m, g5_m, g6_m, g7_m); \ - \ - k0_m = SET_COSPI_PAIR(cospi_9_64, cospi_23_64); \ - k1_m = SET_COSPI_PAIR(cospi_23_64, -cospi_9_64); \ - k2_m = SET_COSPI_PAIR(cospi_25_64, cospi_7_64); \ - k3_m = SET_COSPI_PAIR(cospi_7_64, -cospi_25_64); \ - VP9_MADD_BF(r11, r4, r3, r12, k0_m, k1_m, k2_m, k3_m, \ - g8_m, g9_m, g10_m, g11_m); \ - \ - k0_m = SET_COSPI_PAIR(cospi_13_64, cospi_19_64); \ - k1_m = SET_COSPI_PAIR(cospi_19_64, -cospi_13_64); \ - k2_m = SET_COSPI_PAIR(cospi_29_64, cospi_3_64); \ - k3_m = SET_COSPI_PAIR(cospi_3_64, -cospi_29_64); \ - VP9_MADD_BF(r9, r6, r1, r14, k0_m, k1_m, k2_m, k3_m, \ - g12_m, g13_m, g14_m, g15_m); \ - \ - /* stage 2 */ \ - k0_m = SET_COSPI_PAIR(cospi_4_64, cospi_28_64); \ - k1_m = SET_COSPI_PAIR(cospi_28_64, -cospi_4_64); \ - k2_m = SET_COSPI_PAIR(-cospi_28_64, cospi_4_64); \ - VP9_MADD_BF(g1_m, g3_m, g9_m, g11_m, k0_m, k1_m, k2_m, k0_m, \ - h0_m, h1_m, h2_m, h3_m); \ - \ - k0_m = SET_COSPI_PAIR(cospi_12_64, cospi_20_64); \ - k1_m = SET_COSPI_PAIR(-cospi_20_64, cospi_12_64); \ - k2_m = SET_COSPI_PAIR(cospi_20_64, -cospi_12_64); \ - VP9_MADD_BF(g7_m, g5_m, g15_m, g13_m, k0_m, k1_m, k2_m, k0_m, \ - h4_m, h5_m, h6_m, h7_m); \ - \ - BUTTERFLY_4(h0_m, h2_m, h6_m, h4_m, out8, out9, out11, out10); \ - \ - BUTTERFLY_8(g0_m, g2_m, g4_m, g6_m, g14_m, g12_m, g10_m, g8_m, \ - h8_m, h9_m, h10_m, h11_m, h6_m, h4_m, h2_m, h0_m); \ - \ - /* stage 3 */ \ - BUTTERFLY_4(h8_m, h9_m, h11_m, h10_m, out0, out1, h11_m, h10_m); \ - \ - k0_m = SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \ - k1_m = SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \ - k2_m = 
SET_COSPI_PAIR(-cospi_24_64, cospi_8_64); \ - VP9_MADD_BF(h0_m, h2_m, h4_m, h6_m, k0_m, k1_m, k2_m, k0_m, \ - out4, out6, out5, out7); \ - VP9_MADD_BF(h1_m, h3_m, h5_m, h7_m, k0_m, k1_m, k2_m, k0_m, \ - out12, out14, out13, out15); \ - \ - /* stage 4 */ \ - k0_m = SET_COSPI_PAIR(cospi_16_64, cospi_16_64); \ - k1_m = SET_COSPI_PAIR(-cospi_16_64, -cospi_16_64); \ - k2_m = SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); \ - k3_m = SET_COSPI_PAIR(-cospi_16_64, cospi_16_64); \ - VP9_MADD_SHORT(h10_m, h11_m, k1_m, k2_m, out2, out3); \ - VP9_MADD_SHORT(out6, out7, k0_m, k3_m, out6, out7); \ - VP9_MADD_SHORT(out10, out11, k0_m, k3_m, out10, out11); \ - VP9_MADD_SHORT(out14, out15, k1_m, k2_m, out14, out15); \ -} - -#define VP9_ADDBLK_CLIP_AND_STORE_8_BYTES_4(dest, dest_stride, \ - in0, in1, in2, in3) { \ - uint64_t out0_m, out1_m, out2_m, out3_m; \ - v8i16 res0_m, res1_m, res2_m, res3_m; \ - v16u8 dest0_m, dest1_m, dest2_m, dest3_m; \ - v16i8 tmp0_m, tmp1_m; \ - v16i8 zero_m = { 0 }; \ - uint8_t *dst_m = (uint8_t *)(dest); \ - \ - LOAD_4VECS_UB(dst_m, (dest_stride), \ - dest0_m, dest1_m, dest2_m, dest3_m); \ - \ - res0_m = (v8i16)__msa_ilvr_b(zero_m, (v16i8)dest0_m); \ - res1_m = (v8i16)__msa_ilvr_b(zero_m, (v16i8)dest1_m); \ - res2_m = (v8i16)__msa_ilvr_b(zero_m, (v16i8)dest2_m); \ - res3_m = (v8i16)__msa_ilvr_b(zero_m, (v16i8)dest3_m); \ - \ - res0_m += (v8i16)(in0); \ - res1_m += (v8i16)(in1); \ - res2_m += (v8i16)(in2); \ - res3_m += (v8i16)(in3); \ - \ - res0_m = CLIP_UNSIGNED_CHAR_H(res0_m); \ - res1_m = CLIP_UNSIGNED_CHAR_H(res1_m); \ - res2_m = CLIP_UNSIGNED_CHAR_H(res2_m); \ - res3_m = CLIP_UNSIGNED_CHAR_H(res3_m); \ - \ - tmp0_m = __msa_pckev_b((v16i8)res1_m, (v16i8)res0_m); \ - tmp1_m = __msa_pckev_b((v16i8)res3_m, (v16i8)res2_m); \ - \ - out0_m = __msa_copy_u_d((v2i64)tmp0_m, 0); \ - out1_m = __msa_copy_u_d((v2i64)tmp0_m, 1); \ - out2_m = __msa_copy_u_d((v2i64)tmp1_m, 0); \ - out3_m = __msa_copy_u_d((v2i64)tmp1_m, 1); \ - \ - STORE_DWORD(dst_m, out0_m); \ - dst_m += (dest_stride); \ - STORE_DWORD(dst_m, out1_m); \ - dst_m += (dest_stride); \ - STORE_DWORD(dst_m, out2_m); \ - dst_m += (dest_stride); \ - STORE_DWORD(dst_m, out3_m); \ -} +#include "vp9/common/mips/msa/vp9_idct_msa.h" void vp9_idct16_1d_rows_msa(const int16_t *input, int16_t *output) { v8i16 loc0, loc1, loc2, loc3; @@ -275,63 +17,38 @@ void vp9_idct16_1d_rows_msa(const int16_t *input, int16_t *output) { v8i16 reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15; v8i16 tmp5, tmp6, tmp7; - /* load left top 8x8 */ - LOAD_8VECS_SH(input, 16, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); - - /* load right top 8x8 */ - LOAD_8VECS_SH((input + 8), 16, - reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15); - - /* transpose block */ - TRANSPOSE8x8_H1(reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, - reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); - - /* transpose block */ - TRANSPOSE8x8_H1(reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15, - reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15); - - DOTP_CONST_PAIR(reg2, reg14, cospi_28_64, cospi_4_64, reg2, reg14); - DOTP_CONST_PAIR(reg10, reg6, cospi_12_64, cospi_20_64, reg10, reg6); - - loc0 = reg2 + reg10; - reg2 = reg2 - reg10; - loc1 = reg14 + reg6; - reg14 = reg14 - reg6; - - DOTP_CONST_PAIR(reg14, reg2, cospi_16_64, cospi_16_64, loc2, loc3); - DOTP_CONST_PAIR(reg0, reg8, cospi_16_64, cospi_16_64, reg0, reg8); - DOTP_CONST_PAIR(reg4, reg12, cospi_24_64, cospi_8_64, reg4, reg12); - - reg14 = reg8 - reg12; - reg2 = reg8 + reg12; - reg10 = reg0 - reg4; - reg6 = reg0 + reg4; - - 
reg0 = reg2 - loc1; - reg2 = reg2 + loc1; - reg12 = reg14 - loc0; - reg14 = reg14 + loc0; - reg4 = reg6 - loc3; - reg6 = reg6 + loc3; - reg8 = reg10 - loc2; - reg10 = reg10 + loc2; + LD_SH8(input, 16, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); + input += 8; + LD_SH8(input, 16, reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15); + + TRANSPOSE8x8_SH_SH(reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, + reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); + TRANSPOSE8x8_SH_SH(reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15, + reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15); + VP9_DOTP_CONST_PAIR(reg2, reg14, cospi_28_64, cospi_4_64, reg2, reg14); + VP9_DOTP_CONST_PAIR(reg10, reg6, cospi_12_64, cospi_20_64, reg10, reg6); + BUTTERFLY_4(reg2, reg14, reg6, reg10, loc0, loc1, reg14, reg2); + VP9_DOTP_CONST_PAIR(reg14, reg2, cospi_16_64, cospi_16_64, loc2, loc3); + VP9_DOTP_CONST_PAIR(reg0, reg8, cospi_16_64, cospi_16_64, reg0, reg8); + VP9_DOTP_CONST_PAIR(reg4, reg12, cospi_24_64, cospi_8_64, reg4, reg12); + BUTTERFLY_4(reg8, reg0, reg4, reg12, reg2, reg6, reg10, reg14); + SUB4(reg2, loc1, reg14, loc0, reg6, loc3, reg10, loc2, reg0, reg12, reg4, + reg8); + ADD4(reg2, loc1, reg14, loc0, reg6, loc3, reg10, loc2, reg2, reg14, reg6, + reg10); /* stage 2 */ - DOTP_CONST_PAIR(reg1, reg15, cospi_30_64, cospi_2_64, reg1, reg15); - DOTP_CONST_PAIR(reg9, reg7, cospi_14_64, cospi_18_64, loc2, loc3); + VP9_DOTP_CONST_PAIR(reg1, reg15, cospi_30_64, cospi_2_64, reg1, reg15); + VP9_DOTP_CONST_PAIR(reg9, reg7, cospi_14_64, cospi_18_64, loc2, loc3); reg9 = reg1 - loc2; reg1 = reg1 + loc2; reg7 = reg15 - loc3; reg15 = reg15 + loc3; - DOTP_CONST_PAIR(reg5, reg11, cospi_22_64, cospi_10_64, reg5, reg11); - DOTP_CONST_PAIR(reg13, reg3, cospi_6_64, cospi_26_64, loc0, loc1); - - reg13 = loc0 + reg5; - reg5 = loc0 - reg5; - reg3 = loc1 + reg11; - reg11 = loc1 - reg11; + VP9_DOTP_CONST_PAIR(reg5, reg11, cospi_22_64, cospi_10_64, reg5, reg11); + VP9_DOTP_CONST_PAIR(reg13, reg3, cospi_6_64, cospi_26_64, loc0, loc1); + BUTTERFLY_4(loc0, loc1, reg11, reg5, reg13, reg3, reg11, reg5); loc1 = reg15 + reg3; reg3 = reg15 - reg3; @@ -346,8 +63,8 @@ void vp9_idct16_1d_rows_msa(const int16_t *input, int16_t *output) { tmp7 = loc1; reg0 = loc2; - DOTP_CONST_PAIR(reg7, reg9, cospi_24_64, cospi_8_64, reg7, reg9); - DOTP_CONST_PAIR((-reg5), (-reg11), cospi_8_64, cospi_24_64, reg5, reg11); + VP9_DOTP_CONST_PAIR(reg7, reg9, cospi_24_64, cospi_8_64, reg7, reg9); + VP9_DOTP_CONST_PAIR((-reg5), (-reg11), cospi_8_64, cospi_24_64, reg5, reg11); loc0 = reg9 + reg5; reg5 = reg9 - reg5; @@ -360,21 +77,15 @@ void vp9_idct16_1d_rows_msa(const int16_t *input, int16_t *output) { loc2 = reg4 - loc0; tmp5 = loc1; - DOTP_CONST_PAIR(reg5, reg11, cospi_16_64, cospi_16_64, reg5, reg11); + VP9_DOTP_CONST_PAIR(reg5, reg11, cospi_16_64, cospi_16_64, reg5, reg11); + BUTTERFLY_4(reg8, reg10, reg11, reg5, loc0, reg4, reg9, loc1); - loc0 = reg8 + reg5; - loc1 = reg8 - reg5; - reg4 = reg10 + reg11; - reg9 = reg10 - reg11; reg10 = loc0; reg11 = loc1; - DOTP_CONST_PAIR(reg3, reg13, cospi_16_64, cospi_16_64, reg3, reg13); + VP9_DOTP_CONST_PAIR(reg3, reg13, cospi_16_64, cospi_16_64, reg3, reg13); + BUTTERFLY_4(reg12, reg14, reg13, reg3, reg8, reg6, reg7, reg5); - reg8 = reg12 + reg3; - reg5 = reg12 - reg3; - reg6 = reg14 + reg13; - reg7 = reg14 - reg13; reg13 = loc2; /* Transpose and store the output */ @@ -383,49 +94,36 @@ void vp9_idct16_1d_rows_msa(const int16_t *input, int16_t *output) { reg3 = tmp7; /* transpose block */ - TRANSPOSE8x8_H1(reg0, reg2, 
reg4, reg6, reg8, reg10, reg12, reg14, - reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14); - - STORE_8VECS_SH(output, 16, reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14); + TRANSPOSE8x8_SH_SH(reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14, + reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14); + ST_SH8(reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14, output, 16); /* transpose block */ - TRANSPOSE8x8_H1(reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15, - reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15); - - STORE_8VECS_SH((output + 8), 16, - reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15); + TRANSPOSE8x8_SH_SH(reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15, + reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15); + ST_SH8(reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15, (output + 8), 16); } -void vp9_idct16_1d_columns_addblk_msa(int16_t *input, uint8_t *dest, - int32_t dest_stride) { +void vp9_idct16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst, + int32_t dst_stride) { v8i16 loc0, loc1, loc2, loc3; v8i16 reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14; v8i16 reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15; v8i16 tmp5, tmp6, tmp7; /* load up 8x8 */ - LOAD_8VECS_SH(input, 16, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); - + LD_SH8(input, 16, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); + input += 8 * 16; /* load bottom 8x8 */ - LOAD_8VECS_SH((input + 8 * 16), 16, - reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15); + LD_SH8(input, 16, reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15); - DOTP_CONST_PAIR(reg2, reg14, cospi_28_64, cospi_4_64, reg2, reg14); - DOTP_CONST_PAIR(reg10, reg6, cospi_12_64, cospi_20_64, reg10, reg6); - - loc0 = reg2 + reg10; - reg2 = reg2 - reg10; - loc1 = reg14 + reg6; - reg14 = reg14 - reg6; - - DOTP_CONST_PAIR(reg14, reg2, cospi_16_64, cospi_16_64, loc2, loc3); - DOTP_CONST_PAIR(reg0, reg8, cospi_16_64, cospi_16_64, reg0, reg8); - DOTP_CONST_PAIR(reg4, reg12, cospi_24_64, cospi_8_64, reg4, reg12); - - reg14 = reg8 - reg12; - reg2 = reg8 + reg12; - reg10 = reg0 - reg4; - reg6 = reg0 + reg4; + VP9_DOTP_CONST_PAIR(reg2, reg14, cospi_28_64, cospi_4_64, reg2, reg14); + VP9_DOTP_CONST_PAIR(reg10, reg6, cospi_12_64, cospi_20_64, reg10, reg6); + BUTTERFLY_4(reg2, reg14, reg6, reg10, loc0, loc1, reg14, reg2); + VP9_DOTP_CONST_PAIR(reg14, reg2, cospi_16_64, cospi_16_64, loc2, loc3); + VP9_DOTP_CONST_PAIR(reg0, reg8, cospi_16_64, cospi_16_64, reg0, reg8); + VP9_DOTP_CONST_PAIR(reg4, reg12, cospi_24_64, cospi_8_64, reg4, reg12); + BUTTERFLY_4(reg8, reg0, reg4, reg12, reg2, reg6, reg10, reg14); reg0 = reg2 - loc1; reg2 = reg2 + loc1; @@ -437,21 +135,17 @@ void vp9_idct16_1d_columns_addblk_msa(int16_t *input, uint8_t *dest, reg10 = reg10 + loc2; /* stage 2 */ - DOTP_CONST_PAIR(reg1, reg15, cospi_30_64, cospi_2_64, reg1, reg15); - DOTP_CONST_PAIR(reg9, reg7, cospi_14_64, cospi_18_64, loc2, loc3); + VP9_DOTP_CONST_PAIR(reg1, reg15, cospi_30_64, cospi_2_64, reg1, reg15); + VP9_DOTP_CONST_PAIR(reg9, reg7, cospi_14_64, cospi_18_64, loc2, loc3); reg9 = reg1 - loc2; reg1 = reg1 + loc2; reg7 = reg15 - loc3; reg15 = reg15 + loc3; - DOTP_CONST_PAIR(reg5, reg11, cospi_22_64, cospi_10_64, reg5, reg11); - DOTP_CONST_PAIR(reg13, reg3, cospi_6_64, cospi_26_64, loc0, loc1); - - reg13 = loc0 + reg5; - reg5 = loc0 - reg5; - reg3 = loc1 + reg11; - reg11 = loc1 - reg11; + VP9_DOTP_CONST_PAIR(reg5, reg11, cospi_22_64, cospi_10_64, reg5, reg11); + VP9_DOTP_CONST_PAIR(reg13, reg3, cospi_6_64, cospi_26_64, loc0, loc1); + BUTTERFLY_4(loc0, loc1, reg11, reg5, 
reg13, reg3, reg11, reg5); loc1 = reg15 + reg3; reg3 = reg15 - reg3; @@ -466,8 +160,8 @@ void vp9_idct16_1d_columns_addblk_msa(int16_t *input, uint8_t *dest, tmp7 = loc1; reg0 = loc2; - DOTP_CONST_PAIR(reg7, reg9, cospi_24_64, cospi_8_64, reg7, reg9); - DOTP_CONST_PAIR((-reg5), (-reg11), cospi_8_64, cospi_24_64, reg5, reg11); + VP9_DOTP_CONST_PAIR(reg7, reg9, cospi_24_64, cospi_8_64, reg7, reg9); + VP9_DOTP_CONST_PAIR((-reg5), (-reg11), cospi_8_64, cospi_24_64, reg5, reg11); loc0 = reg9 + reg5; reg5 = reg9 - reg5; @@ -480,21 +174,14 @@ void vp9_idct16_1d_columns_addblk_msa(int16_t *input, uint8_t *dest, loc2 = reg4 - loc0; tmp5 = loc1; - DOTP_CONST_PAIR(reg5, reg11, cospi_16_64, cospi_16_64, reg5, reg11); + VP9_DOTP_CONST_PAIR(reg5, reg11, cospi_16_64, cospi_16_64, reg5, reg11); + BUTTERFLY_4(reg8, reg10, reg11, reg5, loc0, reg4, reg9, loc1); - loc0 = reg8 + reg5; - loc1 = reg8 - reg5; - reg4 = reg10 + reg11; - reg9 = reg10 - reg11; reg10 = loc0; reg11 = loc1; - DOTP_CONST_PAIR(reg3, reg13, cospi_16_64, cospi_16_64, reg3, reg13); - - reg8 = reg12 + reg3; - reg5 = reg12 - reg3; - reg6 = reg14 + reg13; - reg7 = reg14 - reg13; + VP9_DOTP_CONST_PAIR(reg3, reg13, cospi_16_64, cospi_16_64, reg3, reg13); + BUTTERFLY_4(reg12, reg14, reg13, reg3, reg8, reg6, reg7, reg5); reg13 = loc2; /* Transpose and store the output */ @@ -502,22 +189,21 @@ void vp9_idct16_1d_columns_addblk_msa(int16_t *input, uint8_t *dest, reg14 = tmp6; reg3 = tmp7; - SRARI_H_4VECS_SH(reg0, reg2, reg4, reg6, reg0, reg2, reg4, reg6, 6); - VP9_ADDBLK_CLIP_AND_STORE_8_BYTES_4(dest, dest_stride, - reg0, reg2, reg4, reg6); - SRARI_H_4VECS_SH(reg8, reg10, reg12, reg14, reg8, reg10, reg12, reg14, 6); - VP9_ADDBLK_CLIP_AND_STORE_8_BYTES_4((dest + (4 * dest_stride)), - dest_stride, reg8, reg10, reg12, reg14); - SRARI_H_4VECS_SH(reg3, reg13, reg11, reg5, reg3, reg13, reg11, reg5, 6); - VP9_ADDBLK_CLIP_AND_STORE_8_BYTES_4((dest + (8 * dest_stride)), - dest_stride, reg3, reg13, reg11, reg5); - SRARI_H_4VECS_SH(reg7, reg9, reg1, reg15, reg7, reg9, reg1, reg15, 6); - VP9_ADDBLK_CLIP_AND_STORE_8_BYTES_4((dest + (12 * dest_stride)), - dest_stride, reg7, reg9, reg1, reg15); + SRARI_H4_SH(reg0, reg2, reg4, reg6, 6); + VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg0, reg2, reg4, reg6); + dst += (4 * dst_stride); + SRARI_H4_SH(reg8, reg10, reg12, reg14, 6); + VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg8, reg10, reg12, reg14); + dst += (4 * dst_stride); + SRARI_H4_SH(reg3, reg13, reg11, reg5, 6); + VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg3, reg13, reg11, reg5); + dst += (4 * dst_stride); + SRARI_H4_SH(reg7, reg9, reg1, reg15, 6); + VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg7, reg9, reg1, reg15); } -void vp9_idct16x16_256_add_msa(const int16_t *input, uint8_t *dest, - int32_t dest_stride) { +void vp9_idct16x16_256_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { int32_t i; DECLARE_ALIGNED(32, int16_t, out_arr[16 * 16]); int16_t *out = out_arr; @@ -531,13 +217,13 @@ void vp9_idct16x16_256_add_msa(const int16_t *input, uint8_t *dest, /* transform columns */ for (i = 0; i < 2; ++i) { /* process 8 * 16 block */ - vp9_idct16_1d_columns_addblk_msa((out + (i << 3)), (dest + (i << 3)), - dest_stride); + vp9_idct16_1d_columns_addblk_msa((out + (i << 3)), (dst + (i << 3)), + dst_stride); } } -void vp9_idct16x16_10_add_msa(const int16_t *input, uint8_t *dest, - int32_t dest_stride) { +void vp9_idct16x16_10_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { uint8_t i; DECLARE_ALIGNED(32, int16_t, out_arr[16 * 16]); int16_t *out = 
out_arr; @@ -570,64 +256,38 @@ void vp9_idct16x16_10_add_msa(const int16_t *input, uint8_t *dest, /* transform columns */ for (i = 0; i < 2; ++i) { /* process 8 * 16 block */ - vp9_idct16_1d_columns_addblk_msa((out + (i << 3)), (dest + (i << 3)), - dest_stride); + vp9_idct16_1d_columns_addblk_msa((out + (i << 3)), (dst + (i << 3)), + dst_stride); } } -void vp9_idct16x16_1_add_msa(const int16_t *input, uint8_t *dest, - int32_t dest_stride) { +void vp9_idct16x16_1_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { uint8_t i; - int32_t const1; int16_t out; - v8i16 const2, res0, res1, res2, res3, res4, res5, res6, res7; - v16u8 dest0, dest1, dest2, dest3; - v16u8 tmp0, tmp1, tmp2, tmp3; - v16i8 zero = { 0 }; - - out = dct_const_round_shift(input[0] * cospi_16_64); - out = dct_const_round_shift(out * cospi_16_64); - const1 = ROUND_POWER_OF_TWO(out, 6); - - const2 = __msa_fill_h(const1); - - for (i = 0; i < 4; ++i) { - LOAD_4VECS_UB(dest, dest_stride, dest0, dest1, dest2, dest3); - - res0 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest0); - res1 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest1); - res2 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest2); - res3 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest3); - res4 = (v8i16)__msa_ilvl_b(zero, (v16i8)dest0); - res5 = (v8i16)__msa_ilvl_b(zero, (v16i8)dest1); - res6 = (v8i16)__msa_ilvl_b(zero, (v16i8)dest2); - res7 = (v8i16)__msa_ilvl_b(zero, (v16i8)dest3); - - res0 += const2; - res1 += const2; - res2 += const2; - res3 += const2; - res4 += const2; - res5 += const2; - res6 += const2; - res7 += const2; - - res0 = CLIP_UNSIGNED_CHAR_H(res0); - res1 = CLIP_UNSIGNED_CHAR_H(res1); - res2 = CLIP_UNSIGNED_CHAR_H(res2); - res3 = CLIP_UNSIGNED_CHAR_H(res3); - res4 = CLIP_UNSIGNED_CHAR_H(res4); - res5 = CLIP_UNSIGNED_CHAR_H(res5); - res6 = CLIP_UNSIGNED_CHAR_H(res6); - res7 = CLIP_UNSIGNED_CHAR_H(res7); - - tmp0 = (v16u8)__msa_pckev_b((v16i8)res4, (v16i8)res0); - tmp1 = (v16u8)__msa_pckev_b((v16i8)res5, (v16i8)res1); - tmp2 = (v16u8)__msa_pckev_b((v16i8)res6, (v16i8)res2); - tmp3 = (v16u8)__msa_pckev_b((v16i8)res7, (v16i8)res3); - - STORE_4VECS_UB(dest, dest_stride, tmp0, tmp1, tmp2, tmp3); - dest += (4 * dest_stride); + v8i16 vec, res0, res1, res2, res3, res4, res5, res6, res7; + v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3; + + out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS); + out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS); + out = ROUND_POWER_OF_TWO(out, 6); + + vec = __msa_fill_h(out); + + for (i = 4; i--;) { + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + UNPCK_UB_SH(dst0, res0, res4); + UNPCK_UB_SH(dst1, res1, res5); + UNPCK_UB_SH(dst2, res2, res6); + UNPCK_UB_SH(dst3, res3, res7); + ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3); + ADD4(res4, vec, res5, vec, res6, vec, res7, vec, res4, res5, res6, res7); + CLIP_SH4_0_255(res0, res1, res2, res3); + CLIP_SH4_0_255(res4, res5, res6, res7); + PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3, + tmp0, tmp1, tmp2, tmp3); + ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride); + dst += (4 * dst_stride); } } @@ -636,15 +296,12 @@ static void vp9_iadst16_1d_rows_msa(const int16_t *input, int16_t *output) { v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15; /* load input data */ - LOAD_16VECS_SH(input, 8, - l0, l8, l1, l9, l2, l10, l3, l11, - l4, l12, l5, l13, l6, l14, l7, l15); - - TRANSPOSE8x8_H_SH(l0, l1, l2, l3, l4, l5, l6, l7, - l0, l1, l2, l3, l4, l5, l6, l7); - - TRANSPOSE8x8_H_SH(l8, l9, l10, l11, l12, l13, l14, l15, 
- l8, l9, l10, l11, l12, l13, l14, l15); + LD_SH16(input, 8, + l0, l8, l1, l9, l2, l10, l3, l11, l4, l12, l5, l13, l6, l14, l7, l15); + TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, + l0, l1, l2, l3, l4, l5, l6, l7); + TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, + l8, l9, l10, l11, l12, l13, l14, l15); /* ADST in horizontal */ VP9_IADST8x16_1D(l0, l1, l2, l3, l4, l5, l6, l7, @@ -657,19 +314,16 @@ static void vp9_iadst16_1d_rows_msa(const int16_t *input, int16_t *output) { l13 = -r13; l15 = -r1; - TRANSPOSE8x8_H_SH(r0, l1, r12, l3, r6, r14, r10, r2, - l0, l1, l2, l3, l4, l5, l6, l7); - - STORE_8VECS_SH(output, 16, l0, l1, l2, l3, l4, l5, l6, l7); - - TRANSPOSE8x8_H_SH(r3, r11, r15, r7, r5, l13, r9, l15, - l8, l9, l10, l11, l12, l13, l14, l15); - - STORE_8VECS_SH((output + 8), 16, l8, l9, l10, l11, l12, l13, l14, l15); + TRANSPOSE8x8_SH_SH(r0, l1, r12, l3, r6, r14, r10, r2, + l0, l1, l2, l3, l4, l5, l6, l7); + ST_SH8(l0, l1, l2, l3, l4, l5, l6, l7, output, 16); + TRANSPOSE8x8_SH_SH(r3, r11, r15, r7, r5, l13, r9, l15, + l8, l9, l10, l11, l12, l13, l14, l15); + ST_SH8(l8, l9, l10, l11, l12, l13, l14, l15, (output + 8), 16); } -static void vp9_iadst16_1d_columns_addblk_msa(int16_t *input, uint8_t *dest, - int32_t dest_stride) { +static void vp9_iadst16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst, + int32_t dst_stride) { v8i16 v0, v2, v4, v6, k0, k1, k2, k3; v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15; v8i16 out0, out1, out2, out3, out4, out5, out6, out7; @@ -678,210 +332,163 @@ static void vp9_iadst16_1d_columns_addblk_msa(int16_t *input, uint8_t *dest, v8i16 h0, h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11; v8i16 res0, res1, res2, res3, res4, res5, res6, res7; v8i16 res8, res9, res10, res11, res12, res13, res14, res15; - v16u8 dest0, dest1, dest2, dest3, dest4, dest5, dest6, dest7; - v16u8 dest8, dest9, dest10, dest11, dest12, dest13, dest14, dest15; + v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15; v16i8 zero = { 0 }; - r0 = LOAD_SH(input + 0 * 16); - r3 = LOAD_SH(input + 3 * 16); - r4 = LOAD_SH(input + 4 * 16); - r7 = LOAD_SH(input + 7 * 16); - r8 = LOAD_SH(input + 8 * 16); - r11 = LOAD_SH(input + 11 * 16); - r12 = LOAD_SH(input + 12 * 16); - r15 = LOAD_SH(input + 15 * 16); + r0 = LD_SH(input + 0 * 16); + r3 = LD_SH(input + 3 * 16); + r4 = LD_SH(input + 4 * 16); + r7 = LD_SH(input + 7 * 16); + r8 = LD_SH(input + 8 * 16); + r11 = LD_SH(input + 11 * 16); + r12 = LD_SH(input + 12 * 16); + r15 = LD_SH(input + 15 * 16); /* stage 1 */ - k0 = SET_COSPI_PAIR(cospi_1_64, cospi_31_64); - k1 = SET_COSPI_PAIR(cospi_31_64, -cospi_1_64); - k2 = SET_COSPI_PAIR(cospi_17_64, cospi_15_64); - k3 = SET_COSPI_PAIR(cospi_15_64, -cospi_17_64); + k0 = VP9_SET_COSPI_PAIR(cospi_1_64, cospi_31_64); + k1 = VP9_SET_COSPI_PAIR(cospi_31_64, -cospi_1_64); + k2 = VP9_SET_COSPI_PAIR(cospi_17_64, cospi_15_64); + k3 = VP9_SET_COSPI_PAIR(cospi_15_64, -cospi_17_64); VP9_MADD_BF(r15, r0, r7, r8, k0, k1, k2, k3, g0, g1, g2, g3); - - k0 = SET_COSPI_PAIR(cospi_9_64, cospi_23_64); - k1 = SET_COSPI_PAIR(cospi_23_64, -cospi_9_64); - k2 = SET_COSPI_PAIR(cospi_25_64, cospi_7_64); - k3 = SET_COSPI_PAIR(cospi_7_64, -cospi_25_64); + k0 = VP9_SET_COSPI_PAIR(cospi_9_64, cospi_23_64); + k1 = VP9_SET_COSPI_PAIR(cospi_23_64, -cospi_9_64); + k2 = VP9_SET_COSPI_PAIR(cospi_25_64, cospi_7_64); + k3 = VP9_SET_COSPI_PAIR(cospi_7_64, -cospi_25_64); VP9_MADD_BF(r11, r4, r3, r12, k0, k1, k2, k3, g8, g9, g10, g11); - BUTTERFLY_4(g0, 
g2, g10, g8, h8, h9, v2, v0); - - k0 = SET_COSPI_PAIR(cospi_4_64, cospi_28_64); - k1 = SET_COSPI_PAIR(cospi_28_64, -cospi_4_64); - k2 = SET_COSPI_PAIR(-cospi_28_64, cospi_4_64); + k0 = VP9_SET_COSPI_PAIR(cospi_4_64, cospi_28_64); + k1 = VP9_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64); + k2 = VP9_SET_COSPI_PAIR(-cospi_28_64, cospi_4_64); VP9_MADD_BF(g1, g3, g9, g11, k0, k1, k2, k0, h0, h1, h2, h3); - r1 = LOAD_SH(input + 1 * 16); - r2 = LOAD_SH(input + 2 * 16); - r5 = LOAD_SH(input + 5 * 16); - r6 = LOAD_SH(input + 6 * 16); - r9 = LOAD_SH(input + 9 * 16); - r10 = LOAD_SH(input + 10 * 16); - r13 = LOAD_SH(input + 13 * 16); - r14 = LOAD_SH(input + 14 * 16); - - k0 = SET_COSPI_PAIR(cospi_5_64, cospi_27_64); - k1 = SET_COSPI_PAIR(cospi_27_64, -cospi_5_64); - k2 = SET_COSPI_PAIR(cospi_21_64, cospi_11_64); - k3 = SET_COSPI_PAIR(cospi_11_64, -cospi_21_64); + r1 = LD_SH(input + 1 * 16); + r2 = LD_SH(input + 2 * 16); + r5 = LD_SH(input + 5 * 16); + r6 = LD_SH(input + 6 * 16); + r9 = LD_SH(input + 9 * 16); + r10 = LD_SH(input + 10 * 16); + r13 = LD_SH(input + 13 * 16); + r14 = LD_SH(input + 14 * 16); + + k0 = VP9_SET_COSPI_PAIR(cospi_5_64, cospi_27_64); + k1 = VP9_SET_COSPI_PAIR(cospi_27_64, -cospi_5_64); + k2 = VP9_SET_COSPI_PAIR(cospi_21_64, cospi_11_64); + k3 = VP9_SET_COSPI_PAIR(cospi_11_64, -cospi_21_64); VP9_MADD_BF(r13, r2, r5, r10, k0, k1, k2, k3, g4, g5, g6, g7); - - k0 = SET_COSPI_PAIR(cospi_13_64, cospi_19_64); - k1 = SET_COSPI_PAIR(cospi_19_64, -cospi_13_64); - k2 = SET_COSPI_PAIR(cospi_29_64, cospi_3_64); - k3 = SET_COSPI_PAIR(cospi_3_64, -cospi_29_64); + k0 = VP9_SET_COSPI_PAIR(cospi_13_64, cospi_19_64); + k1 = VP9_SET_COSPI_PAIR(cospi_19_64, -cospi_13_64); + k2 = VP9_SET_COSPI_PAIR(cospi_29_64, cospi_3_64); + k3 = VP9_SET_COSPI_PAIR(cospi_3_64, -cospi_29_64); VP9_MADD_BF(r9, r6, r1, r14, k0, k1, k2, k3, g12, g13, g14, g15); - BUTTERFLY_4(g4, g6, g14, g12, h10, h11, v6, v4); - BUTTERFLY_4(h8, h9, h11, h10, out0, out1, h11, h10); out1 = -out1; - out0 = __msa_srari_h(out0, 6); - out1 = __msa_srari_h(out1, 6); - dest0 = LOAD_UB(dest + 0 * dest_stride); - dest1 = LOAD_UB(dest + 15 * dest_stride); - res0 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest0); - res1 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest1); - res0 += out0; - res1 += out1; - res0 = CLIP_UNSIGNED_CHAR_H(res0); - res1 = CLIP_UNSIGNED_CHAR_H(res1); - res0 = (v8i16)__msa_pckev_b((v16i8)res0, (v16i8)res0); - res1 = (v8i16)__msa_pckev_b((v16i8)res1, (v16i8)res1); - STORE_DWORD(dest, __msa_copy_u_d((v2i64)res0, 0)); - STORE_DWORD(dest + 15 * dest_stride, __msa_copy_u_d((v2i64)res1, 0)); - - k0 = SET_COSPI_PAIR(cospi_12_64, cospi_20_64); - k1 = SET_COSPI_PAIR(-cospi_20_64, cospi_12_64); - k2 = SET_COSPI_PAIR(cospi_20_64, -cospi_12_64); + SRARI_H2_SH(out0, out1, 6); + dst0 = LD_UB(dst + 0 * dst_stride); + dst1 = LD_UB(dst + 15 * dst_stride); + ILVR_B2_SH(zero, dst0, zero, dst1, res0, res1); + ADD2(res0, out0, res1, out1, res0, res1); + CLIP_SH2_0_255(res0, res1); + PCKEV_B2_SH(res0, res0, res1, res1, res0, res1); + ST8x1_UB(res0, dst); + ST8x1_UB(res1, dst + 15 * dst_stride); + + k0 = VP9_SET_COSPI_PAIR(cospi_12_64, cospi_20_64); + k1 = VP9_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64); + k2 = VP9_SET_COSPI_PAIR(cospi_20_64, -cospi_12_64); VP9_MADD_BF(g7, g5, g15, g13, k0, k1, k2, k0, h4, h5, h6, h7); - BUTTERFLY_4(h0, h2, h6, h4, out8, out9, out11, out10); out8 = -out8; - out8 = __msa_srari_h(out8, 6); - out9 = __msa_srari_h(out9, 6); - dest8 = LOAD_UB(dest + 1 * dest_stride); - dest9 = LOAD_UB(dest + 14 * dest_stride); - res8 = 
(v8i16)__msa_ilvr_b(zero, (v16i8)dest8); - res9 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest9); - res8 += out8; - res9 += out9; - res8 = CLIP_UNSIGNED_CHAR_H(res8); - res9 = CLIP_UNSIGNED_CHAR_H(res9); - res8 = (v8i16)__msa_pckev_b((v16i8)res8, (v16i8)res8); - res9 = (v8i16)__msa_pckev_b((v16i8)res9, (v16i8)res9); - STORE_DWORD(dest + dest_stride, __msa_copy_u_d((v2i64)res8, 0)); - STORE_DWORD(dest + 14 * dest_stride, __msa_copy_u_d((v2i64)res9, 0)); - - k0 = SET_COSPI_PAIR(cospi_8_64, cospi_24_64); - k1 = SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); - k2 = SET_COSPI_PAIR(-cospi_24_64, cospi_8_64); + SRARI_H2_SH(out8, out9, 6); + dst8 = LD_UB(dst + 1 * dst_stride); + dst9 = LD_UB(dst + 14 * dst_stride); + ILVR_B2_SH(zero, dst8, zero, dst9, res8, res9); + ADD2(res8, out8, res9, out9, res8, res9); + CLIP_SH2_0_255(res8, res9); + PCKEV_B2_SH(res8, res8, res9, res9, res8, res9); + ST8x1_UB(res8, dst + dst_stride); + ST8x1_UB(res9, dst + 14 * dst_stride); + + k0 = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); + k1 = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); + k2 = VP9_SET_COSPI_PAIR(-cospi_24_64, cospi_8_64); VP9_MADD_BF(v0, v2, v4, v6, k0, k1, k2, k0, out4, out6, out5, out7); out4 = -out4; - out4 = __msa_srari_h(out4, 6); - out5 = __msa_srari_h(out5, 6); - dest4 = LOAD_UB(dest + 3 * dest_stride); - dest5 = LOAD_UB(dest + 12 * dest_stride); - res4 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest4); - res5 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest5); - res4 += out4; - res5 += out5; - res4 = CLIP_UNSIGNED_CHAR_H(res4); - res5 = CLIP_UNSIGNED_CHAR_H(res5); - res4 = (v8i16)__msa_pckev_b((v16i8)res4, (v16i8)res4); - res5 = (v8i16)__msa_pckev_b((v16i8)res5, (v16i8)res5); - STORE_DWORD(dest + 3 * dest_stride, __msa_copy_u_d((v2i64)res4, 0)); - STORE_DWORD(dest + 12 * dest_stride, __msa_copy_u_d((v2i64)res5, 0)); + SRARI_H2_SH(out4, out5, 6); + dst4 = LD_UB(dst + 3 * dst_stride); + dst5 = LD_UB(dst + 12 * dst_stride); + ILVR_B2_SH(zero, dst4, zero, dst5, res4, res5); + ADD2(res4, out4, res5, out5, res4, res5); + CLIP_SH2_0_255(res4, res5); + PCKEV_B2_SH(res4, res4, res5, res5, res4, res5); + ST8x1_UB(res4, dst + 3 * dst_stride); + ST8x1_UB(res5, dst + 12 * dst_stride); VP9_MADD_BF(h1, h3, h5, h7, k0, k1, k2, k0, out12, out14, out13, out15); out13 = -out13; - out12 = __msa_srari_h(out12, 6); - out13 = __msa_srari_h(out13, 6); - dest12 = LOAD_UB(dest + 2 * dest_stride); - dest13 = LOAD_UB(dest + 13 * dest_stride); - res12 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest12); - res13 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest13); - res12 += out12; - res13 += out13; - res12 = CLIP_UNSIGNED_CHAR_H(res12); - res13 = CLIP_UNSIGNED_CHAR_H(res13); - res12 = (v8i16)__msa_pckev_b((v16i8)res12, (v16i8)res12); - res13 = (v8i16)__msa_pckev_b((v16i8)res13, (v16i8)res13); - STORE_DWORD(dest + 2 * dest_stride, __msa_copy_u_d((v2i64)res12, 0)); - STORE_DWORD(dest + 13 * dest_stride, __msa_copy_u_d((v2i64)res13, 0)); - - k0 = SET_COSPI_PAIR(cospi_16_64, cospi_16_64); - k3 = SET_COSPI_PAIR(-cospi_16_64, cospi_16_64); + SRARI_H2_SH(out12, out13, 6); + dst12 = LD_UB(dst + 2 * dst_stride); + dst13 = LD_UB(dst + 13 * dst_stride); + ILVR_B2_SH(zero, dst12, zero, dst13, res12, res13); + ADD2(res12, out12, res13, out13, res12, res13); + CLIP_SH2_0_255(res12, res13); + PCKEV_B2_SH(res12, res12, res13, res13, res12, res13); + ST8x1_UB(res12, dst + 2 * dst_stride); + ST8x1_UB(res13, dst + 13 * dst_stride); + + k0 = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); + k3 = VP9_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64); VP9_MADD_SHORT(out6, out7, k0, k3, out6, out7); 
- out6 = __msa_srari_h(out6, 6); - out7 = __msa_srari_h(out7, 6); - dest6 = LOAD_UB(dest + 4 * dest_stride); - dest7 = LOAD_UB(dest + 11 * dest_stride); - res6 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest6); - res7 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest7); - res6 += out6; - res7 += out7; - res6 = CLIP_UNSIGNED_CHAR_H(res6); - res7 = CLIP_UNSIGNED_CHAR_H(res7); - res6 = (v8i16)__msa_pckev_b((v16i8)res6, (v16i8)res6); - res7 = (v8i16)__msa_pckev_b((v16i8)res7, (v16i8)res7); - STORE_DWORD(dest + 4 * dest_stride, __msa_copy_u_d((v2i64)res6, 0)); - STORE_DWORD(dest + 11 * dest_stride, __msa_copy_u_d((v2i64)res7, 0)); + SRARI_H2_SH(out6, out7, 6); + dst6 = LD_UB(dst + 4 * dst_stride); + dst7 = LD_UB(dst + 11 * dst_stride); + ILVR_B2_SH(zero, dst6, zero, dst7, res6, res7); + ADD2(res6, out6, res7, out7, res6, res7); + CLIP_SH2_0_255(res6, res7); + PCKEV_B2_SH(res6, res6, res7, res7, res6, res7); + ST8x1_UB(res6, dst + 4 * dst_stride); + ST8x1_UB(res7, dst + 11 * dst_stride); VP9_MADD_SHORT(out10, out11, k0, k3, out10, out11); - out10 = __msa_srari_h(out10, 6); - out11 = __msa_srari_h(out11, 6); - dest10 = LOAD_UB(dest + 6 * dest_stride); - dest11 = LOAD_UB(dest + 9 * dest_stride); - res10 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest10); - res11 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest11); - res10 += out10; - res11 += out11; - res10 = CLIP_UNSIGNED_CHAR_H(res10); - res11 = CLIP_UNSIGNED_CHAR_H(res11); - res10 = (v8i16)__msa_pckev_b((v16i8)res10, (v16i8)res10); - res11 = (v8i16)__msa_pckev_b((v16i8)res11, (v16i8)res11); - STORE_DWORD(dest + 6 * dest_stride, __msa_copy_u_d((v2i64)res10, 0)); - STORE_DWORD(dest + 9 * dest_stride, __msa_copy_u_d((v2i64)res11, 0)); - - k1 = SET_COSPI_PAIR(-cospi_16_64, -cospi_16_64); - k2 = SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); + SRARI_H2_SH(out10, out11, 6); + dst10 = LD_UB(dst + 6 * dst_stride); + dst11 = LD_UB(dst + 9 * dst_stride); + ILVR_B2_SH(zero, dst10, zero, dst11, res10, res11); + ADD2(res10, out10, res11, out11, res10, res11); + CLIP_SH2_0_255(res10, res11); + PCKEV_B2_SH(res10, res10, res11, res11, res10, res11); + ST8x1_UB(res10, dst + 6 * dst_stride); + ST8x1_UB(res11, dst + 9 * dst_stride); + + k1 = VP9_SET_COSPI_PAIR(-cospi_16_64, -cospi_16_64); + k2 = VP9_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); VP9_MADD_SHORT(h10, h11, k1, k2, out2, out3); - out2 = __msa_srari_h(out2, 6); - out3 = __msa_srari_h(out3, 6); - dest2 = LOAD_UB(dest + 7 * dest_stride); - dest3 = LOAD_UB(dest + 8 * dest_stride); - res2 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest2); - res3 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest3); - res2 += out2; - res3 += out3; - res2 = CLIP_UNSIGNED_CHAR_H(res2); - res3 = CLIP_UNSIGNED_CHAR_H(res3); - res2 = (v8i16)__msa_pckev_b((v16i8)res2, (v16i8)res2); - res3 = (v8i16)__msa_pckev_b((v16i8)res3, (v16i8)res3); - STORE_DWORD(dest + 7 * dest_stride, __msa_copy_u_d((v2i64)res2, 0)); - STORE_DWORD(dest + 8 * dest_stride, __msa_copy_u_d((v2i64)res3, 0)); + SRARI_H2_SH(out2, out3, 6); + dst2 = LD_UB(dst + 7 * dst_stride); + dst3 = LD_UB(dst + 8 * dst_stride); + ILVR_B2_SH(zero, dst2, zero, dst3, res2, res3); + ADD2(res2, out2, res3, out3, res2, res3); + CLIP_SH2_0_255(res2, res3); + PCKEV_B2_SH(res2, res2, res3, res3, res2, res3); + ST8x1_UB(res2, dst + 7 * dst_stride); + ST8x1_UB(res3, dst + 8 * dst_stride); VP9_MADD_SHORT(out14, out15, k1, k2, out14, out15); - out14 = __msa_srari_h(out14, 6); - out15 = __msa_srari_h(out15, 6); - dest14 = LOAD_UB(dest + 5 * dest_stride); - dest15 = LOAD_UB(dest + 10 * dest_stride); - res14 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest14); 
- res15 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest15); - res14 += out14; - res15 += out15; - res14 = CLIP_UNSIGNED_CHAR_H(res14); - res15 = CLIP_UNSIGNED_CHAR_H(res15); - res14 = (v8i16)__msa_pckev_b((v16i8)res14, (v16i8)res14); - res15 = (v8i16)__msa_pckev_b((v16i8)res15, (v16i8)res15); - STORE_DWORD(dest + 5 * dest_stride, __msa_copy_u_d((v2i64)res14, 0)); - STORE_DWORD(dest + 10 * dest_stride, __msa_copy_u_d((v2i64)res15, 0)); + SRARI_H2_SH(out14, out15, 6); + dst14 = LD_UB(dst + 5 * dst_stride); + dst15 = LD_UB(dst + 10 * dst_stride); + ILVR_B2_SH(zero, dst14, zero, dst15, res14, res15); + ADD2(res14, out14, res15, out15, res14, res15); + CLIP_SH2_0_255(res14, res15); + PCKEV_B2_SH(res14, res14, res15, res15, res14, res15); + ST8x1_UB(res14, dst + 5 * dst_stride); + ST8x1_UB(res15, dst + 10 * dst_stride); } -void vp9_iht16x16_256_add_msa(const int16_t *input, uint8_t *dest, - int32_t dest_stride, int32_t tx_type) { +void vp9_iht16x16_256_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride, int32_t tx_type) { int32_t i; DECLARE_ALIGNED(32, int16_t, out[16 * 16]); int16_t *out_ptr = &out[0]; @@ -897,8 +504,8 @@ void vp9_iht16x16_256_add_msa(const int16_t *input, uint8_t *dest, /* transform columns */ for (i = 0; i < 2; ++i) { /* process 8 * 16 block */ - vp9_idct16_1d_columns_addblk_msa((out_ptr + (i << 3)), - (dest + (i << 3)), dest_stride); + vp9_idct16_1d_columns_addblk_msa((out_ptr + (i << 3)), (dst + (i << 3)), + dst_stride); } break; case ADST_DCT: @@ -911,7 +518,7 @@ void vp9_iht16x16_256_add_msa(const int16_t *input, uint8_t *dest, /* transform columns */ for (i = 0; i < 2; ++i) { vp9_iadst16_1d_columns_addblk_msa((out_ptr + (i << 3)), - (dest + (i << 3)), dest_stride); + (dst + (i << 3)), dst_stride); } break; case DCT_ADST: @@ -924,8 +531,8 @@ void vp9_iht16x16_256_add_msa(const int16_t *input, uint8_t *dest, /* transform columns */ for (i = 0; i < 2; ++i) { /* process 8 * 16 block */ - vp9_idct16_1d_columns_addblk_msa((out_ptr + (i << 3)), - (dest + (i << 3)), dest_stride); + vp9_idct16_1d_columns_addblk_msa((out_ptr + (i << 3)), (dst + (i << 3)), + dst_stride); } break; case ADST_ADST: @@ -938,7 +545,7 @@ void vp9_iht16x16_256_add_msa(const int16_t *input, uint8_t *dest, /* transform columns */ for (i = 0; i < 2; ++i) { vp9_iadst16_1d_columns_addblk_msa((out_ptr + (i << 3)), - (dest + (i << 3)), dest_stride); + (dst + (i << 3)), dst_stride); } break; default: diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_idct32x32_msa.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_idct32x32_msa.c index f576b50ea07..77d53a4c35f 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_idct32x32_msa.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_idct32x32_msa.c @@ -8,108 +8,34 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#include "vpx_ports/mem.h" -#include "vp9/common/vp9_idct.h" -#include "vp9/common/mips/msa/vp9_macros_msa.h" - -#define DOTP_CONST_PAIR(reg0, reg1, const0, const1, out0, out1) { \ - v8i16 k0_m = __msa_fill_h(const0); \ - v8i16 s0_m, s1_m, s2_m, s3_m; \ - \ - s0_m = __msa_fill_h(const1); \ - k0_m = __msa_ilvev_h(s0_m, k0_m); \ - \ - s0_m = __msa_ilvl_h(-reg1, reg0); \ - s1_m = __msa_ilvr_h(-reg1, reg0); \ - s2_m = __msa_ilvl_h(reg0, reg1); \ - s3_m = __msa_ilvr_h(reg0, reg1); \ - s1_m = (v8i16)__msa_dotp_s_w(s1_m, k0_m); \ - s0_m = (v8i16)__msa_dotp_s_w(s0_m, k0_m); \ - s1_m = (v8i16)__msa_srari_w((v4i32)s1_m, DCT_CONST_BITS); \ - s0_m = (v8i16)__msa_srari_w((v4i32)s0_m, DCT_CONST_BITS); \ - out0 = __msa_pckev_h(s0_m, s1_m); \ - \ - s1_m = (v8i16)__msa_dotp_s_w(s3_m, k0_m); \ - s0_m = (v8i16)__msa_dotp_s_w(s2_m, k0_m); \ - s1_m = (v8i16)__msa_srari_w((v4i32)s1_m, DCT_CONST_BITS); \ - s0_m = (v8i16)__msa_srari_w((v4i32)s0_m, DCT_CONST_BITS); \ - out1 = __msa_pckev_h(s0_m, s1_m); \ -} - -#define VP9_ADDBLK_CLIP_AND_STORE_OFF_4H_VECS(dest, dest_stride, \ - in0, in1, in2, in3) { \ - uint64_t out0_m, out1_m, out2_m, out3_m; \ - v8i16 res0_m, res1_m, res2_m, res3_m; \ - v16u8 dest0_m, dest1_m, dest2_m, dest3_m; \ - v16i8 tmp0_m, tmp1_m; \ - v16i8 zero_m = { 0 }; \ - uint8_t *dst_m = (uint8_t *)(dest); \ - \ - dest0_m = LOAD_UB(dst_m); \ - dest1_m = LOAD_UB(dst_m + 4 * dest_stride); \ - dest2_m = LOAD_UB(dst_m + 8 * dest_stride); \ - dest3_m = LOAD_UB(dst_m + 12 * dest_stride); \ - \ - res0_m = (v8i16)__msa_ilvr_b(zero_m, (v16i8)dest0_m); \ - res1_m = (v8i16)__msa_ilvr_b(zero_m, (v16i8)dest1_m); \ - res2_m = (v8i16)__msa_ilvr_b(zero_m, (v16i8)dest2_m); \ - res3_m = (v8i16)__msa_ilvr_b(zero_m, (v16i8)dest3_m); \ - \ - res0_m += (v8i16)(in0); \ - res1_m += (v8i16)(in1); \ - res2_m += (v8i16)(in2); \ - res3_m += (v8i16)(in3); \ - \ - res0_m = CLIP_UNSIGNED_CHAR_H(res0_m); \ - res1_m = CLIP_UNSIGNED_CHAR_H(res1_m); \ - res2_m = CLIP_UNSIGNED_CHAR_H(res2_m); \ - res3_m = CLIP_UNSIGNED_CHAR_H(res3_m); \ - \ - tmp0_m = __msa_pckev_b((v16i8)res1_m, (v16i8)res0_m); \ - tmp1_m = __msa_pckev_b((v16i8)res3_m, (v16i8)res2_m); \ - \ - out0_m = __msa_copy_u_d((v2i64)tmp0_m, 0); \ - out1_m = __msa_copy_u_d((v2i64)tmp0_m, 1); \ - out2_m = __msa_copy_u_d((v2i64)tmp1_m, 0); \ - out3_m = __msa_copy_u_d((v2i64)tmp1_m, 1); \ - \ - STORE_DWORD(dst_m, out0_m); \ - dst_m += (4 * dest_stride); \ - STORE_DWORD(dst_m, out1_m); \ - dst_m += (4 * dest_stride); \ - STORE_DWORD(dst_m, out2_m); \ - dst_m += (4 * dest_stride); \ - STORE_DWORD(dst_m, out3_m); \ -} +#include "vp9/common/mips/msa/vp9_idct_msa.h" static void vp9_idct32x8_row_transpose_store(const int16_t *input, int16_t *tmp_buf) { - v8i16 m0, m1, m2, m3, m4, m5, m6, m7; - v8i16 n0, n1, n2, n3, n4, n5, n6, n7; + v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7; /* 1st & 2nd 8x8 */ - LOAD_8VECS_SH(input, 32, m0, n0, m1, n1, m2, n2, m3, n3); - LOAD_8VECS_SH((input + 8), 32, m4, n4, m5, n5, m6, n6, m7, n7); - TRANSPOSE8x8_H_SH(m0, n0, m1, n1, m2, n2, m3, n3, - m0, n0, m1, n1, m2, n2, m3, n3); - TRANSPOSE8x8_H_SH(m4, n4, m5, n5, m6, n6, m7, n7, - m4, n4, m5, n5, m6, n6, m7, n7); - STORE_4VECS_SH((tmp_buf), 8, m0, n0, m1, n1); - STORE_4VECS_SH((tmp_buf + 4 * 8), 8, m2, n2, m3, n3); - STORE_4VECS_SH((tmp_buf + 8 * 8), 8, m4, n4, m5, n5); - STORE_4VECS_SH((tmp_buf + 12 * 8), 8, m6, n6, m7, n7); + LD_SH8(input, 32, m0, n0, m1, n1, m2, n2, m3, n3); + LD_SH8((input + 8), 32, m4, n4, m5, n5, m6, n6, m7, n7); + TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, 
m3, n3, + m0, n0, m1, n1, m2, n2, m3, n3); + TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, + m4, n4, m5, n5, m6, n6, m7, n7); + ST_SH8(m0, n0, m1, n1, m2, n2, m3, n3, (tmp_buf), 8); + ST_SH4(m4, n4, m5, n5, (tmp_buf + 8 * 8), 8); + ST_SH4(m6, n6, m7, n7, (tmp_buf + 12 * 8), 8); /* 3rd & 4th 8x8 */ - LOAD_8VECS_SH((input + 16), 32, m0, n0, m1, n1, m2, n2, m3, n3); - LOAD_8VECS_SH((input + 24), 32, m4, n4, m5, n5, m6, n6, m7, n7); - TRANSPOSE8x8_H_SH(m0, n0, m1, n1, m2, n2, m3, n3, - m0, n0, m1, n1, m2, n2, m3, n3); - TRANSPOSE8x8_H_SH(m4, n4, m5, n5, m6, n6, m7, n7, - m4, n4, m5, n5, m6, n6, m7, n7); - STORE_4VECS_SH((tmp_buf + 16 * 8), 8, m0, n0, m1, n1); - STORE_4VECS_SH((tmp_buf + 20 * 8), 8, m2, n2, m3, n3); - STORE_4VECS_SH((tmp_buf + 24 * 8), 8, m4, n4, m5, n5); - STORE_4VECS_SH((tmp_buf + 28 * 8), 8, m6, n6, m7, n7); + LD_SH8((input + 16), 32, m0, n0, m1, n1, m2, n2, m3, n3); + LD_SH8((input + 24), 32, m4, n4, m5, n5, m6, n6, m7, n7); + TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, + m0, n0, m1, n1, m2, n2, m3, n3); + TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, + m4, n4, m5, n5, m6, n6, m7, n7); + ST_SH4(m0, n0, m1, n1, (tmp_buf + 16 * 8), 8); + ST_SH4(m2, n2, m3, n3, (tmp_buf + 20 * 8), 8); + ST_SH4(m4, n4, m5, n5, (tmp_buf + 24 * 8), 8); + ST_SH4(m6, n6, m7, n7, (tmp_buf + 28 * 8), 8); } static void vp9_idct32x8_row_even_process_store(int16_t *tmp_buf, @@ -119,46 +45,28 @@ static void vp9_idct32x8_row_even_process_store(int16_t *tmp_buf, v8i16 stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7; /* Even stage 1 */ - LOAD_8VECS_SH(tmp_buf, 32, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); + LD_SH8(tmp_buf, 32, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); - DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7); - DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3); - - vec0 = reg1 - reg5; - vec1 = reg1 + reg5; - vec2 = reg7 - reg3; - vec3 = reg7 + reg3; - - DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3); + VP9_DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7); + VP9_DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3); + BUTTERFLY_4(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0); + VP9_DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3); loc1 = vec3; loc0 = vec1; - DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4); - DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6); - - vec0 = reg4 - reg6; - vec1 = reg4 + reg6; - vec2 = reg0 - reg2; - vec3 = reg0 + reg2; - - stp4 = vec0 - loc0; - stp3 = vec0 + loc0; - stp7 = vec1 - loc1; - stp0 = vec1 + loc1; - stp5 = vec2 - loc2; - stp2 = vec2 + loc2; - stp6 = vec3 - loc3; - stp1 = vec3 + loc3; + VP9_DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4); + VP9_DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6); + BUTTERFLY_4(reg4, reg0, reg2, reg6, vec1, vec3, vec2, vec0); + BUTTERFLY_4(vec0, vec1, loc1, loc0, stp3, stp0, stp7, stp4); + BUTTERFLY_4(vec2, vec3, loc3, loc2, stp2, stp1, stp6, stp5); /* Even stage 2 */ - LOAD_8VECS_SH((tmp_buf + 16), 32, - reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); - - DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7); - DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3); - DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5); - DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1); + LD_SH8((tmp_buf + 16), 32, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); + VP9_DOTP_CONST_PAIR(reg0, reg7, 
cospi_30_64, cospi_2_64, reg0, reg7); + VP9_DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3); + VP9_DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5); + VP9_DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1); vec0 = reg0 + reg4; reg0 = reg0 - reg4; @@ -176,54 +84,42 @@ static void vp9_idct32x8_row_even_process_store(int16_t *tmp_buf, reg4 = reg5 - vec1; reg5 = reg5 + vec1; - DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7); - DOTP_CONST_PAIR((-reg6), reg1, cospi_24_64, cospi_8_64, reg6, reg1); + VP9_DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7); + VP9_DOTP_CONST_PAIR((-reg6), reg1, cospi_24_64, cospi_8_64, reg6, reg1); vec0 = reg0 - reg6; reg0 = reg0 + reg6; vec1 = reg7 - reg1; reg7 = reg7 + reg1; - DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1); - DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4); + VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1); + VP9_DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4); /* Even stage 3 : Dependency on Even stage 1 & Even stage 2 */ - loc0 = stp0 - reg5; - loc1 = stp0 + reg5; - loc2 = stp1 - reg7; - loc3 = stp1 + reg7; - STORE_SH(loc0, (tmp_eve_buf + 15 * 8)); - STORE_SH(loc1, (tmp_eve_buf)); - STORE_SH(loc2, (tmp_eve_buf + 14 * 8)); - STORE_SH(loc3, (tmp_eve_buf + 8)); - - loc0 = stp2 - reg1; - loc1 = stp2 + reg1; - loc2 = stp3 - reg4; - loc3 = stp3 + reg4; - STORE_SH(loc0, (tmp_eve_buf + 13 * 8)); - STORE_SH(loc1, (tmp_eve_buf + 2 * 8)); - STORE_SH(loc2, (tmp_eve_buf + 12 * 8)); - STORE_SH(loc3, (tmp_eve_buf + 3 * 8)); + BUTTERFLY_4(stp0, stp1, reg7, reg5, loc1, loc3, loc2, loc0); + ST_SH(loc0, (tmp_eve_buf + 15 * 8)); + ST_SH(loc1, (tmp_eve_buf)); + ST_SH(loc2, (tmp_eve_buf + 14 * 8)); + ST_SH(loc3, (tmp_eve_buf + 8)); + + BUTTERFLY_4(stp2, stp3, reg4, reg1, loc1, loc3, loc2, loc0); + ST_SH(loc0, (tmp_eve_buf + 13 * 8)); + ST_SH(loc1, (tmp_eve_buf + 2 * 8)); + ST_SH(loc2, (tmp_eve_buf + 12 * 8)); + ST_SH(loc3, (tmp_eve_buf + 3 * 8)); /* Store 8 */ - loc0 = stp4 - reg3; - loc1 = stp4 + reg3; - loc2 = stp5 - reg6; - loc3 = stp5 + reg6; - STORE_SH(loc0, (tmp_eve_buf + 11 * 8)); - STORE_SH(loc1, (tmp_eve_buf + 4 * 8)); - STORE_SH(loc2, (tmp_eve_buf + 10 * 8)); - STORE_SH(loc3, (tmp_eve_buf + 5 * 8)); - - loc0 = stp6 - reg0; - loc1 = stp6 + reg0; - loc2 = stp7 - reg2; - loc3 = stp7 + reg2; - STORE_SH(loc0, (tmp_eve_buf + 9 * 8)); - STORE_SH(loc1, (tmp_eve_buf + 6 * 8)); - STORE_SH(loc2, (tmp_eve_buf + 8 * 8)); - STORE_SH(loc3, (tmp_eve_buf + 7 * 8)); + BUTTERFLY_4(stp4, stp5, reg6, reg3, loc1, loc3, loc2, loc0); + ST_SH(loc0, (tmp_eve_buf + 11 * 8)); + ST_SH(loc1, (tmp_eve_buf + 4 * 8)); + ST_SH(loc2, (tmp_eve_buf + 10 * 8)); + ST_SH(loc3, (tmp_eve_buf + 5 * 8)); + + BUTTERFLY_4(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0); + ST_SH(loc0, (tmp_eve_buf + 9 * 8)); + ST_SH(loc1, (tmp_eve_buf + 6 * 8)); + ST_SH(loc2, (tmp_eve_buf + 8 * 8)); + ST_SH(loc3, (tmp_eve_buf + 7 * 8)); } static void vp9_idct32x8_row_odd_process_store(int16_t *tmp_buf, @@ -232,19 +128,19 @@ static void vp9_idct32x8_row_odd_process_store(int16_t *tmp_buf, v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; /* Odd stage 1 */ - reg0 = LOAD_SH(tmp_buf + 8); - reg1 = LOAD_SH(tmp_buf + 7 * 8); - reg2 = LOAD_SH(tmp_buf + 9 * 8); - reg3 = LOAD_SH(tmp_buf + 15 * 8); - reg4 = LOAD_SH(tmp_buf + 17 * 8); - reg5 = LOAD_SH(tmp_buf + 23 * 8); - reg6 = LOAD_SH(tmp_buf + 25 * 8); - reg7 = LOAD_SH(tmp_buf + 31 * 8); - - DOTP_CONST_PAIR(reg0, reg7, 
cospi_31_64, cospi_1_64, reg0, reg7); - DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4); - DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5); - DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6); + reg0 = LD_SH(tmp_buf + 8); + reg1 = LD_SH(tmp_buf + 7 * 8); + reg2 = LD_SH(tmp_buf + 9 * 8); + reg3 = LD_SH(tmp_buf + 15 * 8); + reg4 = LD_SH(tmp_buf + 17 * 8); + reg5 = LD_SH(tmp_buf + 23 * 8); + reg6 = LD_SH(tmp_buf + 25 * 8); + reg7 = LD_SH(tmp_buf + 31 * 8); + + VP9_DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7); + VP9_DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4); + VP9_DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5); + VP9_DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6); vec0 = reg0 + reg3; reg0 = reg0 - reg3; @@ -257,262 +153,192 @@ static void vp9_idct32x8_row_odd_process_store(int16_t *tmp_buf, reg5 = vec0; /* 4 Stores */ - vec0 = reg5 + reg4; - vec1 = reg3 + reg2; - STORE_SH(vec0, (tmp_odd_buf + 4 * 8)); - STORE_SH(vec1, (tmp_odd_buf + 5 * 8)); + ADD2(reg5, reg4, reg3, reg2, vec0, vec1); + ST_SH2(vec0, vec1, (tmp_odd_buf + 4 * 8), 8); - vec0 = reg5 - reg4; - vec1 = reg3 - reg2; - DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1); - STORE_SH(vec0, (tmp_odd_buf)); - STORE_SH(vec1, (tmp_odd_buf + 8)); + SUB2(reg5, reg4, reg3, reg2, vec0, vec1); + VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1); + ST_SH2(vec0, vec1, (tmp_odd_buf), 8); /* 4 Stores */ - DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7); - DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6); - - vec0 = reg0 + reg1; - vec2 = reg7 - reg6; - vec1 = reg7 + reg6; - vec3 = reg0 - reg1; - STORE_SH(vec0, (tmp_odd_buf + 6 * 8)); - STORE_SH(vec1, (tmp_odd_buf + 7 * 8)); + VP9_DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7); + VP9_DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6); + BUTTERFLY_4(reg0, reg7, reg6, reg1, vec0, vec1, vec2, vec3); + ST_SH2(vec0, vec1, (tmp_odd_buf + 6 * 8), 8); - DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3); - STORE_SH(vec2, (tmp_odd_buf + 2 * 8)); - STORE_SH(vec3, (tmp_odd_buf + 3 * 8)); + VP9_DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3); + ST_SH2(vec2, vec3, (tmp_odd_buf + 2 * 8), 8); /* Odd stage 2 */ - /* 8 loads */ - reg0 = LOAD_SH(tmp_buf + 3 * 8); - reg1 = LOAD_SH(tmp_buf + 5 * 8); - reg2 = LOAD_SH(tmp_buf + 11 * 8); - reg3 = LOAD_SH(tmp_buf + 13 * 8); - reg4 = LOAD_SH(tmp_buf + 19 * 8); - reg5 = LOAD_SH(tmp_buf + 21 * 8); - reg6 = LOAD_SH(tmp_buf + 27 * 8); - reg7 = LOAD_SH(tmp_buf + 29 * 8); - - DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6); - DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5); - DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4); - DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7); + reg0 = LD_SH(tmp_buf + 3 * 8); + reg1 = LD_SH(tmp_buf + 5 * 8); + reg2 = LD_SH(tmp_buf + 11 * 8); + reg3 = LD_SH(tmp_buf + 13 * 8); + reg4 = LD_SH(tmp_buf + 19 * 8); + reg5 = LD_SH(tmp_buf + 21 * 8); + reg6 = LD_SH(tmp_buf + 27 * 8); + reg7 = LD_SH(tmp_buf + 29 * 8); + + VP9_DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6); + VP9_DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5); + VP9_DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4); + VP9_DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7); /* 4 Stores */ - vec0 = reg1 - 
reg2; - vec1 = reg6 - reg5; - vec2 = reg0 - reg3; - vec3 = reg7 - reg4; - DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1); - DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3); - - vec2 = loc2 - loc0; - vec3 = loc3 - loc1; - vec0 = loc2 + loc0; - vec1 = loc3 + loc1; - STORE_SH(vec0, (tmp_odd_buf + 12 * 8)); - STORE_SH(vec1, (tmp_odd_buf + 15 * 8)); + SUB4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, + vec0, vec1, vec2, vec3); + VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1); + VP9_DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3); - DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1); + BUTTERFLY_4(loc3, loc2, loc0, loc1, vec1, vec0, vec2, vec3); + ST_SH2(vec0, vec1, (tmp_odd_buf + 12 * 8), 3 * 8); - STORE_SH(vec0, (tmp_odd_buf + 10 * 8)); - STORE_SH(vec1, (tmp_odd_buf + 11 * 8)); + VP9_DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1); + ST_SH2(vec0, vec1, (tmp_odd_buf + 10 * 8), 8); /* 4 Stores */ - vec0 = reg0 + reg3; - vec1 = reg1 + reg2; - vec2 = reg6 + reg5; - vec3 = reg7 + reg4; - reg0 = vec0 + vec1; - reg1 = vec3 + vec2; - reg2 = vec0 - vec1; - reg3 = vec3 - vec2; - STORE_SH(reg0, (tmp_odd_buf + 13 * 8)); - STORE_SH(reg1, (tmp_odd_buf + 14 * 8)); - - DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1); + ADD4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, + vec1, vec2, vec0, vec3); + BUTTERFLY_4(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2); + ST_SH(reg0, (tmp_odd_buf + 13 * 8)); + ST_SH(reg1, (tmp_odd_buf + 14 * 8)); - STORE_SH(reg0, (tmp_odd_buf + 8 * 8)); - STORE_SH(reg1, (tmp_odd_buf + 9 * 8)); + VP9_DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1); + ST_SH2(reg0, reg1, (tmp_odd_buf + 8 * 8), 8); /* Odd stage 3 : Dependency on Odd stage 1 & Odd stage 2 */ /* Load 8 & Store 8 */ - reg0 = LOAD_SH(tmp_odd_buf); - reg1 = LOAD_SH(tmp_odd_buf + 1 * 8); - reg2 = LOAD_SH(tmp_odd_buf + 2 * 8); - reg3 = LOAD_SH(tmp_odd_buf + 3 * 8); - reg4 = LOAD_SH(tmp_odd_buf + 8 * 8); - reg5 = LOAD_SH(tmp_odd_buf + 9 * 8); - reg6 = LOAD_SH(tmp_odd_buf + 10 * 8); - reg7 = LOAD_SH(tmp_odd_buf + 11 * 8); - - loc0 = reg0 + reg4; - loc1 = reg1 + reg5; - loc2 = reg2 + reg6; - loc3 = reg3 + reg7; - STORE_SH(loc0, (tmp_odd_buf)); - STORE_SH(loc1, (tmp_odd_buf + 1 * 8)); - STORE_SH(loc2, (tmp_odd_buf + 2 * 8)); - STORE_SH(loc3, (tmp_odd_buf + 3 * 8)); - - vec0 = reg0 - reg4; - vec1 = reg1 - reg5; - DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1); - - vec0 = reg2 - reg6; - vec1 = reg3 - reg7; - DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3); - - STORE_SH(loc0, (tmp_odd_buf + 8 * 8)); - STORE_SH(loc1, (tmp_odd_buf + 9 * 8)); - STORE_SH(loc2, (tmp_odd_buf + 10 * 8)); - STORE_SH(loc3, (tmp_odd_buf + 11 * 8)); + LD_SH4(tmp_odd_buf, 8, reg0, reg1, reg2, reg3); + LD_SH4((tmp_odd_buf + 8 * 8), 8, reg4, reg5, reg6, reg7); + + ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, + loc0, loc1, loc2, loc3); + ST_SH4(loc0, loc1, loc2, loc3, tmp_odd_buf, 8); + + SUB2(reg0, reg4, reg1, reg5, vec0, vec1); + VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1); + + SUB2(reg2, reg6, reg3, reg7, vec0, vec1); + VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3); + ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 8 * 8), 8); /* Load 8 & Store 8 */ - reg1 = LOAD_SH(tmp_odd_buf + 4 * 8); - reg2 = LOAD_SH(tmp_odd_buf + 5 * 8); - reg0 = LOAD_SH(tmp_odd_buf + 6 * 8); - reg3 = LOAD_SH(tmp_odd_buf + 7 * 8); - reg4 = 
LOAD_SH(tmp_odd_buf + 12 * 8); - reg5 = LOAD_SH(tmp_odd_buf + 13 * 8); - reg6 = LOAD_SH(tmp_odd_buf + 14 * 8); - reg7 = LOAD_SH(tmp_odd_buf + 15 * 8); - - loc0 = reg0 + reg4; - loc1 = reg1 + reg5; - loc2 = reg2 + reg6; - loc3 = reg3 + reg7; - STORE_SH(loc0, (tmp_odd_buf + 4 * 8)); - STORE_SH(loc1, (tmp_odd_buf + 5 * 8)); - STORE_SH(loc2, (tmp_odd_buf + 6 * 8)); - STORE_SH(loc3, (tmp_odd_buf + 7 * 8)); - - vec0 = reg0 - reg4; - vec1 = reg3 - reg7; - DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1); - - vec0 = reg1 - reg5; - vec1 = reg2 - reg6; - DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3); - - STORE_SH(loc0, (tmp_odd_buf + 12 * 8)); - STORE_SH(loc1, (tmp_odd_buf + 13 * 8)); - STORE_SH(loc2, (tmp_odd_buf + 14 * 8)); - STORE_SH(loc3, (tmp_odd_buf + 15 * 8)); + LD_SH4((tmp_odd_buf + 4 * 8), 8, reg1, reg2, reg0, reg3); + LD_SH4((tmp_odd_buf + 12 * 8), 8, reg4, reg5, reg6, reg7); + + ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, + loc0, loc1, loc2, loc3); + ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 4 * 8), 8); + + SUB2(reg0, reg4, reg3, reg7, vec0, vec1); + VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1); + + SUB2(reg1, reg5, reg2, reg6, vec0, vec1); + VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3); + ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 12 * 8), 8); } static void vp9_idct_butterfly_transpose_store(int16_t *tmp_buf, int16_t *tmp_eve_buf, int16_t *tmp_odd_buf, - int16_t *dest) { + int16_t *dst) { v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; - v8i16 m0, m1, m2, m3, m4, m5, m6, m7; - v8i16 n0, n1, n2, n3, n4, n5, n6, n7; + v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7; /* FINAL BUTTERFLY : Dependency on Even & Odd */ - /* Total: 32 loads, 32 stores */ - vec0 = LOAD_SH(tmp_odd_buf); - vec1 = LOAD_SH(tmp_odd_buf + 9 * 8); - vec2 = LOAD_SH(tmp_odd_buf + 14 * 8); - vec3 = LOAD_SH(tmp_odd_buf + 6 * 8); - loc0 = LOAD_SH(tmp_eve_buf); - loc1 = LOAD_SH(tmp_eve_buf + 8 * 8); - loc2 = LOAD_SH(tmp_eve_buf + 4 * 8); - loc3 = LOAD_SH(tmp_eve_buf + 12 * 8); - - m0 = (loc0 + vec3); - STORE_SH((loc0 - vec3), (tmp_buf + 31 * 8)); - STORE_SH((loc1 - vec2), (tmp_buf + 23 * 8)); - m4 = (loc1 + vec2); - STORE_SH((loc2 - vec1), (tmp_buf + 27 * 8)); - m2 = (loc2 + vec1); - STORE_SH((loc3 - vec0), (tmp_buf + 19 * 8)); - m6 = (loc3 + vec0); + vec0 = LD_SH(tmp_odd_buf); + vec1 = LD_SH(tmp_odd_buf + 9 * 8); + vec2 = LD_SH(tmp_odd_buf + 14 * 8); + vec3 = LD_SH(tmp_odd_buf + 6 * 8); + loc0 = LD_SH(tmp_eve_buf); + loc1 = LD_SH(tmp_eve_buf + 8 * 8); + loc2 = LD_SH(tmp_eve_buf + 4 * 8); + loc3 = LD_SH(tmp_eve_buf + 12 * 8); + + ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m0, m4, m2, m6); + + ST_SH((loc0 - vec3), (tmp_buf + 31 * 8)); + ST_SH((loc1 - vec2), (tmp_buf + 23 * 8)); + ST_SH((loc2 - vec1), (tmp_buf + 27 * 8)); + ST_SH((loc3 - vec0), (tmp_buf + 19 * 8)); /* Load 8 & Store 8 */ - vec0 = LOAD_SH(tmp_odd_buf + 4 * 8); - vec1 = LOAD_SH(tmp_odd_buf + 13 * 8); - vec2 = LOAD_SH(tmp_odd_buf + 10 * 8); - vec3 = LOAD_SH(tmp_odd_buf + 3 * 8); - loc0 = LOAD_SH(tmp_eve_buf + 2 * 8); - loc1 = LOAD_SH(tmp_eve_buf + 10 * 8); - loc2 = LOAD_SH(tmp_eve_buf + 6 * 8); - loc3 = LOAD_SH(tmp_eve_buf + 14 * 8); - - m1 = (loc0 + vec3); - STORE_SH((loc0 - vec3), (tmp_buf + 29 * 8)); - STORE_SH((loc1 - vec2), (tmp_buf + 21 * 8)); - m5 = (loc1 + vec2); - STORE_SH((loc2 - vec1), (tmp_buf + 25 * 8)); - m3 = (loc2 + vec1); - STORE_SH((loc3 - vec0), (tmp_buf + 17 * 8)); - m7 = (loc3 + vec0); + vec0 = 
LD_SH(tmp_odd_buf + 4 * 8); + vec1 = LD_SH(tmp_odd_buf + 13 * 8); + vec2 = LD_SH(tmp_odd_buf + 10 * 8); + vec3 = LD_SH(tmp_odd_buf + 3 * 8); + loc0 = LD_SH(tmp_eve_buf + 2 * 8); + loc1 = LD_SH(tmp_eve_buf + 10 * 8); + loc2 = LD_SH(tmp_eve_buf + 6 * 8); + loc3 = LD_SH(tmp_eve_buf + 14 * 8); + + ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1, m5, m3, m7); + + ST_SH((loc0 - vec3), (tmp_buf + 29 * 8)); + ST_SH((loc1 - vec2), (tmp_buf + 21 * 8)); + ST_SH((loc2 - vec1), (tmp_buf + 25 * 8)); + ST_SH((loc3 - vec0), (tmp_buf + 17 * 8)); /* Load 8 & Store 8 */ - vec0 = LOAD_SH(tmp_odd_buf + 2 * 8); - vec1 = LOAD_SH(tmp_odd_buf + 11 * 8); - vec2 = LOAD_SH(tmp_odd_buf + 12 * 8); - vec3 = LOAD_SH(tmp_odd_buf + 7 * 8); - loc0 = LOAD_SH(tmp_eve_buf + 1 * 8); - loc1 = LOAD_SH(tmp_eve_buf + 9 * 8); - loc2 = LOAD_SH(tmp_eve_buf + 5 * 8); - loc3 = LOAD_SH(tmp_eve_buf + 13 * 8); - - n0 = (loc0 + vec3); - STORE_SH((loc0 - vec3), (tmp_buf + 30 * 8)); - STORE_SH((loc1 - vec2), (tmp_buf + 22 * 8)); - n4 = (loc1 + vec2); - STORE_SH((loc2 - vec1), (tmp_buf + 26 * 8)); - n2 = (loc2 + vec1); - STORE_SH((loc3 - vec0), (tmp_buf + 18 * 8)); - n6 = (loc3 + vec0); + vec0 = LD_SH(tmp_odd_buf + 2 * 8); + vec1 = LD_SH(tmp_odd_buf + 11 * 8); + vec2 = LD_SH(tmp_odd_buf + 12 * 8); + vec3 = LD_SH(tmp_odd_buf + 7 * 8); + loc0 = LD_SH(tmp_eve_buf + 1 * 8); + loc1 = LD_SH(tmp_eve_buf + 9 * 8); + loc2 = LD_SH(tmp_eve_buf + 5 * 8); + loc3 = LD_SH(tmp_eve_buf + 13 * 8); + + ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0, n4, n2, n6); + + ST_SH((loc0 - vec3), (tmp_buf + 30 * 8)); + ST_SH((loc1 - vec2), (tmp_buf + 22 * 8)); + ST_SH((loc2 - vec1), (tmp_buf + 26 * 8)); + ST_SH((loc3 - vec0), (tmp_buf + 18 * 8)); /* Load 8 & Store 8 */ - vec0 = LOAD_SH(tmp_odd_buf + 5 * 8); - vec1 = LOAD_SH(tmp_odd_buf + 15 * 8); - vec2 = LOAD_SH(tmp_odd_buf + 8 * 8); - vec3 = LOAD_SH(tmp_odd_buf + 1 * 8); - loc0 = LOAD_SH(tmp_eve_buf + 3 * 8); - loc1 = LOAD_SH(tmp_eve_buf + 11 * 8); - loc2 = LOAD_SH(tmp_eve_buf + 7 * 8); - loc3 = LOAD_SH(tmp_eve_buf + 15 * 8); - - n1 = (loc0 + vec3); - STORE_SH((loc0 - vec3), (tmp_buf + 28 * 8)); - STORE_SH((loc1 - vec2), (tmp_buf + 20 * 8)); - n5 = (loc1 + vec2); - STORE_SH((loc2 - vec1), (tmp_buf + 24 * 8)); - n3 = (loc2 + vec1); - STORE_SH((loc3 - vec0), (tmp_buf + 16 * 8)); - n7 = (loc3 + vec0); + vec0 = LD_SH(tmp_odd_buf + 5 * 8); + vec1 = LD_SH(tmp_odd_buf + 15 * 8); + vec2 = LD_SH(tmp_odd_buf + 8 * 8); + vec3 = LD_SH(tmp_odd_buf + 1 * 8); + loc0 = LD_SH(tmp_eve_buf + 3 * 8); + loc1 = LD_SH(tmp_eve_buf + 11 * 8); + loc2 = LD_SH(tmp_eve_buf + 7 * 8); + loc3 = LD_SH(tmp_eve_buf + 15 * 8); + + ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1, n5, n3, n7); + + ST_SH((loc0 - vec3), (tmp_buf + 28 * 8)); + ST_SH((loc1 - vec2), (tmp_buf + 20 * 8)); + ST_SH((loc2 - vec1), (tmp_buf + 24 * 8)); + ST_SH((loc3 - vec0), (tmp_buf + 16 * 8)); /* Transpose : 16 vectors */ /* 1st & 2nd 8x8 */ - TRANSPOSE8x8_H_SH(m0, n0, m1, n1, m2, n2, m3, n3, - m0, n0, m1, n1, m2, n2, m3, n3); - STORE_4VECS_SH((dest + 0), 32, m0, n0, m1, n1); - STORE_4VECS_SH((dest + 4 * 32), 32, m2, n2, m3, n3); + TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, + m0, n0, m1, n1, m2, n2, m3, n3); + ST_SH4(m0, n0, m1, n1, (dst + 0), 32); + ST_SH4(m2, n2, m3, n3, (dst + 4 * 32), 32); - TRANSPOSE8x8_H_SH(m4, n4, m5, n5, m6, n6, m7, n7, - m4, n4, m5, n5, m6, n6, m7, n7); - STORE_4VECS_SH((dest + 8), 32, m4, n4, m5, n5); - STORE_4VECS_SH((dest + 8 + 4 * 32), 32, m6, n6, m7, n7); + TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, + m4, n4, 
m5, n5, m6, n6, m7, n7); + ST_SH4(m4, n4, m5, n5, (dst + 8), 32); + ST_SH4(m6, n6, m7, n7, (dst + 8 + 4 * 32), 32); /* 3rd & 4th 8x8 */ - LOAD_8VECS_SH((tmp_buf + 8 * 16), 8, m0, n0, m1, n1, m2, n2, m3, n3); - LOAD_8VECS_SH((tmp_buf + 12 * 16), 8, m4, n4, m5, n5, m6, n6, m7, n7); - TRANSPOSE8x8_H_SH(m0, n0, m1, n1, m2, n2, m3, n3, - m0, n0, m1, n1, m2, n2, m3, n3); - STORE_4VECS_SH((dest + 16), 32, m0, n0, m1, n1); - STORE_4VECS_SH((dest + 16 + 4 * 32), 32, m2, n2, m3, n3); - - TRANSPOSE8x8_H_SH(m4, n4, m5, n5, m6, n6, m7, n7, - m4, n4, m5, n5, m6, n6, m7, n7); - STORE_4VECS_SH((dest + 24), 32, m4, n4, m5, n5); - STORE_4VECS_SH((dest + 24 + 4 * 32), 32, m6, n6, m7, n7); + LD_SH8((tmp_buf + 8 * 16), 8, m0, n0, m1, n1, m2, n2, m3, n3); + LD_SH8((tmp_buf + 12 * 16), 8, m4, n4, m5, n5, m6, n6, m7, n7); + TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, + m0, n0, m1, n1, m2, n2, m3, n3); + ST_SH4(m0, n0, m1, n1, (dst + 16), 32); + ST_SH4(m2, n2, m3, n3, (dst + 16 + 4 * 32), 32); + + TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, + m4, n4, m5, n5, m6, n6, m7, n7); + ST_SH4(m4, n4, m5, n5, (dst + 24), 32); + ST_SH4(m6, n6, m7, n7, (dst + 24 + 4 * 32), 32); } static void vp9_idct32x8_1d_rows_msa(const int16_t *input, int16_t *output) { @@ -521,11 +347,8 @@ static void vp9_idct32x8_1d_rows_msa(const int16_t *input, int16_t *output) { DECLARE_ALIGNED(32, int16_t, tmp_eve_buf[16 * 8]); vp9_idct32x8_row_transpose_store(input, &tmp_buf[0]); - vp9_idct32x8_row_even_process_store(&tmp_buf[0], &tmp_eve_buf[0]); - vp9_idct32x8_row_odd_process_store(&tmp_buf[0], &tmp_odd_buf[0]); - vp9_idct_butterfly_transpose_store(&tmp_buf[0], &tmp_eve_buf[0], &tmp_odd_buf[0], output); } @@ -537,48 +360,31 @@ static void vp9_idct8x32_column_even_process_store(int16_t *tmp_buf, v8i16 stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7; /* Even stage 1 */ - LOAD_8VECS_SH(tmp_buf, (4 * 32), - reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); + LD_SH8(tmp_buf, (4 * 32), reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); + tmp_buf += (2 * 32); - DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7); - DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3); - - vec0 = reg1 - reg5; - vec1 = reg1 + reg5; - vec2 = reg7 - reg3; - vec3 = reg7 + reg3; - - DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3); + VP9_DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7); + VP9_DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3); + BUTTERFLY_4(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0); + VP9_DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3); loc1 = vec3; loc0 = vec1; - DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4); - DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6); - - vec0 = reg4 - reg6; - vec1 = reg4 + reg6; - vec2 = reg0 - reg2; - vec3 = reg0 + reg2; - - stp4 = vec0 - loc0; - stp3 = vec0 + loc0; - stp7 = vec1 - loc1; - stp0 = vec1 + loc1; - stp5 = vec2 - loc2; - stp2 = vec2 + loc2; - stp6 = vec3 - loc3; - stp1 = vec3 + loc3; + VP9_DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4); + VP9_DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6); + BUTTERFLY_4(reg4, reg0, reg2, reg6, vec1, vec3, vec2, vec0); + BUTTERFLY_4(vec0, vec1, loc1, loc0, stp3, stp0, stp7, stp4); + BUTTERFLY_4(vec2, vec3, loc3, loc2, stp2, stp1, stp6, stp5); /* Even stage 2 */ /* Load 8 */ - LOAD_8VECS_SH((tmp_buf + 2 * 32), (4 * 32), - reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); + LD_SH8(tmp_buf, (4 * 32), reg0, reg1, 
reg2, reg3, reg4, reg5, reg6, reg7); - DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7); - DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3); - DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5); - DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1); + VP9_DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7); + VP9_DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3); + VP9_DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5); + VP9_DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1); vec0 = reg0 + reg4; reg0 = reg0 - reg4; @@ -596,55 +402,35 @@ static void vp9_idct8x32_column_even_process_store(int16_t *tmp_buf, reg4 = reg5 - vec1; reg5 = reg5 + vec1; - DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7); - DOTP_CONST_PAIR((-reg6), reg1, cospi_24_64, cospi_8_64, reg6, reg1); + VP9_DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7); + VP9_DOTP_CONST_PAIR((-reg6), reg1, cospi_24_64, cospi_8_64, reg6, reg1); vec0 = reg0 - reg6; reg0 = reg0 + reg6; vec1 = reg7 - reg1; reg7 = reg7 + reg1; - DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1); - DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4); + VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1); + VP9_DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4); /* Even stage 3 : Dependency on Even stage 1 & Even stage 2 */ /* Store 8 */ - loc0 = stp0 - reg5; - loc1 = stp0 + reg5; - loc2 = stp1 - reg7; - loc3 = stp1 + reg7; - STORE_SH(loc0, (tmp_eve_buf + 15 * 8)); - STORE_SH(loc1, (tmp_eve_buf)); - STORE_SH(loc2, (tmp_eve_buf + 14 * 8)); - STORE_SH(loc3, (tmp_eve_buf + 1 * 8)); - - loc0 = stp2 - reg1; - loc1 = stp2 + reg1; - loc2 = stp3 - reg4; - loc3 = stp3 + reg4; - STORE_SH(loc0, (tmp_eve_buf + 13 * 8)); - STORE_SH(loc1, (tmp_eve_buf + 2 * 8)); - STORE_SH(loc2, (tmp_eve_buf + 12 * 8)); - STORE_SH(loc3, (tmp_eve_buf + 3 * 8)); + BUTTERFLY_4(stp0, stp1, reg7, reg5, loc1, loc3, loc2, loc0); + ST_SH2(loc1, loc3, tmp_eve_buf, 8); + ST_SH2(loc2, loc0, (tmp_eve_buf + 14 * 8), 8); + + BUTTERFLY_4(stp2, stp3, reg4, reg1, loc1, loc3, loc2, loc0); + ST_SH2(loc1, loc3, (tmp_eve_buf + 2 * 8), 8); + ST_SH2(loc2, loc0, (tmp_eve_buf + 12 * 8), 8); /* Store 8 */ - loc0 = stp4 - reg3; - loc1 = stp4 + reg3; - loc2 = stp5 - reg6; - loc3 = stp5 + reg6; - STORE_SH(loc0, (tmp_eve_buf + 11 * 8)); - STORE_SH(loc1, (tmp_eve_buf + 4 * 8)); - STORE_SH(loc2, (tmp_eve_buf + 10 * 8)); - STORE_SH(loc3, (tmp_eve_buf + 5 * 8)); - - loc0 = stp6 - reg0; - loc1 = stp6 + reg0; - loc2 = stp7 - reg2; - loc3 = stp7 + reg2; - STORE_SH(loc0, (tmp_eve_buf + 9 * 8)); - STORE_SH(loc1, (tmp_eve_buf + 6 * 8)); - STORE_SH(loc2, (tmp_eve_buf + 8 * 8)); - STORE_SH(loc3, (tmp_eve_buf + 7 * 8)); + BUTTERFLY_4(stp4, stp5, reg6, reg3, loc1, loc3, loc2, loc0); + ST_SH2(loc1, loc3, (tmp_eve_buf + 4 * 8), 8); + ST_SH2(loc2, loc0, (tmp_eve_buf + 10 * 8), 8); + + BUTTERFLY_4(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0); + ST_SH2(loc1, loc3, (tmp_eve_buf + 6 * 8), 8); + ST_SH2(loc2, loc0, (tmp_eve_buf + 8 * 8), 8); } static void vp9_idct8x32_column_odd_process_store(int16_t *tmp_buf, @@ -653,19 +439,19 @@ static void vp9_idct8x32_column_odd_process_store(int16_t *tmp_buf, v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; /* Odd stage 1 */ - reg0 = LOAD_SH(tmp_buf + 32); - reg1 = LOAD_SH(tmp_buf + 7 * 32); - reg2 = LOAD_SH(tmp_buf + 9 * 32); - reg3 = LOAD_SH(tmp_buf + 15 * 32); - reg4 = LOAD_SH(tmp_buf 
+ 17 * 32); - reg5 = LOAD_SH(tmp_buf + 23 * 32); - reg6 = LOAD_SH(tmp_buf + 25 * 32); - reg7 = LOAD_SH(tmp_buf + 31 * 32); - - DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7); - DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4); - DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5); - DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6); + reg0 = LD_SH(tmp_buf + 32); + reg1 = LD_SH(tmp_buf + 7 * 32); + reg2 = LD_SH(tmp_buf + 9 * 32); + reg3 = LD_SH(tmp_buf + 15 * 32); + reg4 = LD_SH(tmp_buf + 17 * 32); + reg5 = LD_SH(tmp_buf + 23 * 32); + reg6 = LD_SH(tmp_buf + 25 * 32); + reg7 = LD_SH(tmp_buf + 31 * 32); + + VP9_DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7); + VP9_DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4); + VP9_DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5); + VP9_DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6); vec0 = reg0 + reg3; reg0 = reg0 - reg3; @@ -678,278 +464,182 @@ static void vp9_idct8x32_column_odd_process_store(int16_t *tmp_buf, reg5 = vec0; /* 4 Stores */ - vec0 = reg5 + reg4; - vec1 = reg3 + reg2; - STORE_SH(vec0, (tmp_odd_buf + 4 * 8)); - STORE_SH(vec1, (tmp_odd_buf + 5 * 8)); - - vec0 = reg5 - reg4; - vec1 = reg3 - reg2; - DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1); - STORE_SH(vec0, (tmp_odd_buf)); - STORE_SH(vec1, (tmp_odd_buf + 1 * 8)); + ADD2(reg5, reg4, reg3, reg2, vec0, vec1); + ST_SH2(vec0, vec1, (tmp_odd_buf + 4 * 8), 8); + SUB2(reg5, reg4, reg3, reg2, vec0, vec1); + VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1); + ST_SH2(vec0, vec1, tmp_odd_buf, 8); /* 4 Stores */ - DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7); - DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6); - - vec0 = reg0 + reg1; - vec2 = reg7 - reg6; - vec1 = reg7 + reg6; - vec3 = reg0 - reg1; - STORE_SH(vec0, (tmp_odd_buf + 6 * 8)); - STORE_SH(vec1, (tmp_odd_buf + 7 * 8)); - - DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3); - STORE_SH(vec2, (tmp_odd_buf + 2 * 8)); - STORE_SH(vec3, (tmp_odd_buf + 3 * 8)); + VP9_DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7); + VP9_DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6); + BUTTERFLY_4(reg0, reg7, reg6, reg1, vec0, vec1, vec2, vec3); + ST_SH2(vec0, vec1, (tmp_odd_buf + 6 * 8), 8); + VP9_DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3); + ST_SH2(vec2, vec3, (tmp_odd_buf + 2 * 8), 8); /* Odd stage 2 */ /* 8 loads */ - reg0 = LOAD_SH(tmp_buf + 3 * 32); - reg1 = LOAD_SH(tmp_buf + 5 * 32); - reg2 = LOAD_SH(tmp_buf + 11 * 32); - reg3 = LOAD_SH(tmp_buf + 13 * 32); - reg4 = LOAD_SH(tmp_buf + 19 * 32); - reg5 = LOAD_SH(tmp_buf + 21 * 32); - reg6 = LOAD_SH(tmp_buf + 27 * 32); - reg7 = LOAD_SH(tmp_buf + 29 * 32); - - DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6); - DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5); - DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4); - DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7); + reg0 = LD_SH(tmp_buf + 3 * 32); + reg1 = LD_SH(tmp_buf + 5 * 32); + reg2 = LD_SH(tmp_buf + 11 * 32); + reg3 = LD_SH(tmp_buf + 13 * 32); + reg4 = LD_SH(tmp_buf + 19 * 32); + reg5 = LD_SH(tmp_buf + 21 * 32); + reg6 = LD_SH(tmp_buf + 27 * 32); + reg7 = LD_SH(tmp_buf + 29 * 32); + + VP9_DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6); + VP9_DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, 
reg5); + VP9_DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4); + VP9_DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7); /* 4 Stores */ - vec0 = reg1 - reg2; - vec1 = reg6 - reg5; - vec2 = reg0 - reg3; - vec3 = reg7 - reg4; - DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1); - DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3); + SUB4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec0, vec1, vec2, vec3); + VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1); + VP9_DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3); + BUTTERFLY_4(loc2, loc3, loc1, loc0, vec0, vec1, vec3, vec2); + ST_SH2(vec0, vec1, (tmp_odd_buf + 12 * 8), 3 * 8); + VP9_DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1); + ST_SH2(vec0, vec1, (tmp_odd_buf + 10 * 8), 8); - vec2 = loc2 - loc0; - vec3 = loc3 - loc1; - vec0 = loc2 + loc0; - vec1 = loc3 + loc1; - STORE_SH(vec0, (tmp_odd_buf + 12 * 8)); - STORE_SH(vec1, (tmp_odd_buf + 15 * 8)); - - DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1); + /* 4 Stores */ + ADD4(reg0, reg3, reg1, reg2, reg5, reg6, reg4, reg7, vec0, vec1, vec2, vec3); + BUTTERFLY_4(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2); + ST_SH2(reg0, reg1, (tmp_odd_buf + 13 * 8), 8); + VP9_DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1); + ST_SH2(reg0, reg1, (tmp_odd_buf + 8 * 8), 8); - STORE_SH(vec0, (tmp_odd_buf + 10 * 8)); - STORE_SH(vec1, (tmp_odd_buf + 11 * 8)); + /* Odd stage 3 : Dependency on Odd stage 1 & Odd stage 2 */ + /* Load 8 & Store 8 */ + LD_SH4(tmp_odd_buf, 8, reg0, reg1, reg2, reg3); + LD_SH4((tmp_odd_buf + 8 * 8), 8, reg4, reg5, reg6, reg7); - /* 4 Stores */ - vec0 = reg0 + reg3; - vec1 = reg1 + reg2; - vec2 = reg6 + reg5; - vec3 = reg7 + reg4; - reg0 = vec0 + vec1; - reg1 = vec3 + vec2; - reg2 = vec0 - vec1; - reg3 = vec3 - vec2; - STORE_SH(reg0, (tmp_odd_buf + 13 * 8)); - STORE_SH(reg1, (tmp_odd_buf + 14 * 8)); + ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3); + ST_SH4(loc0, loc1, loc2, loc3, tmp_odd_buf, 8); - DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1); + SUB2(reg0, reg4, reg1, reg5, vec0, vec1); + VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1); - STORE_SH(reg0, (tmp_odd_buf + 8 * 8)); - STORE_SH(reg1, (tmp_odd_buf + 9 * 8)); + SUB2(reg2, reg6, reg3, reg7, vec0, vec1); + VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3); + ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 8 * 8), 8); - /* Odd stage 3 : Dependency on Odd stage 1 & Odd stage 2 */ /* Load 8 & Store 8 */ - reg0 = LOAD_SH(tmp_odd_buf); - reg1 = LOAD_SH(tmp_odd_buf + 1 * 8); - reg2 = LOAD_SH(tmp_odd_buf + 2 * 8); - reg3 = LOAD_SH(tmp_odd_buf + 3 * 8); - reg4 = LOAD_SH(tmp_odd_buf + 8 * 8); - reg5 = LOAD_SH(tmp_odd_buf + 9 * 8); - reg6 = LOAD_SH(tmp_odd_buf + 10 * 8); - reg7 = LOAD_SH(tmp_odd_buf + 11 * 8); - - loc0 = reg0 + reg4; - loc1 = reg1 + reg5; - loc2 = reg2 + reg6; - loc3 = reg3 + reg7; - STORE_SH(loc0, (tmp_odd_buf)); - STORE_SH(loc1, (tmp_odd_buf + 1 * 8)); - STORE_SH(loc2, (tmp_odd_buf + 2 * 8)); - STORE_SH(loc3, (tmp_odd_buf + 3 * 8)); - - vec0 = reg0 - reg4; - vec1 = reg1 - reg5; - DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1); - - vec0 = reg2 - reg6; - vec1 = reg3 - reg7; - DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3); - - STORE_SH(loc0, (tmp_odd_buf + 8 * 8)); - STORE_SH(loc1, (tmp_odd_buf + 9 * 8)); - STORE_SH(loc2, (tmp_odd_buf + 10 * 
8)); - STORE_SH(loc3, (tmp_odd_buf + 11 * 8)); + LD_SH4((tmp_odd_buf + 4 * 8), 8, reg1, reg2, reg0, reg3); + LD_SH4((tmp_odd_buf + 12 * 8), 8, reg4, reg5, reg6, reg7); - /* Load 8 & Store 8 */ - reg1 = LOAD_SH(tmp_odd_buf + 4 * 8); - reg2 = LOAD_SH(tmp_odd_buf + 5 * 8); - reg0 = LOAD_SH(tmp_odd_buf + 6 * 8); - reg3 = LOAD_SH(tmp_odd_buf + 7 * 8); - reg4 = LOAD_SH(tmp_odd_buf + 12 * 8); - reg5 = LOAD_SH(tmp_odd_buf + 13 * 8); - reg6 = LOAD_SH(tmp_odd_buf + 14 * 8); - reg7 = LOAD_SH(tmp_odd_buf + 15 * 8); - - loc0 = reg0 + reg4; - loc1 = reg1 + reg5; - loc2 = reg2 + reg6; - loc3 = reg3 + reg7; - STORE_SH(loc0, (tmp_odd_buf + 4 * 8)); - STORE_SH(loc1, (tmp_odd_buf + 5 * 8)); - STORE_SH(loc2, (tmp_odd_buf + 6 * 8)); - STORE_SH(loc3, (tmp_odd_buf + 7 * 8)); - - vec0 = reg0 - reg4; - vec1 = reg3 - reg7; - DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1); - - vec0 = reg1 - reg5; - vec1 = reg2 - reg6; - DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3); - - STORE_SH(loc0, (tmp_odd_buf + 12 * 8)); - STORE_SH(loc1, (tmp_odd_buf + 13 * 8)); - STORE_SH(loc2, (tmp_odd_buf + 14 * 8)); - STORE_SH(loc3, (tmp_odd_buf + 15 * 8)); + ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3); + ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 4 * 8), 8); + + SUB2(reg0, reg4, reg3, reg7, vec0, vec1); + VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1); + + SUB2(reg1, reg5, reg2, reg6, vec0, vec1); + VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3); + ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 12 * 8), 8); } static void vp9_idct8x32_column_butterfly_addblk(int16_t *tmp_eve_buf, int16_t *tmp_odd_buf, - uint8_t *dest, - int32_t dest_stride) { + uint8_t *dst, + int32_t dst_stride) { v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; - v8i16 m0, m1, m2, m3, m4, m5, m6, m7; - v8i16 n0, n1, n2, n3, n4, n5, n6, n7; + v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7; /* FINAL BUTTERFLY : Dependency on Even & Odd */ - vec0 = LOAD_SH(tmp_odd_buf); - vec1 = LOAD_SH(tmp_odd_buf + 9 * 8); - vec2 = LOAD_SH(tmp_odd_buf + 14 * 8); - vec3 = LOAD_SH(tmp_odd_buf + 6 * 8); - loc0 = LOAD_SH(tmp_eve_buf); - loc1 = LOAD_SH(tmp_eve_buf + 8 * 8); - loc2 = LOAD_SH(tmp_eve_buf + 4 * 8); - loc3 = LOAD_SH(tmp_eve_buf + 12 * 8); - - m0 = (loc0 + vec3); - m4 = (loc1 + vec2); - m2 = (loc2 + vec1); - m6 = (loc3 + vec0); - SRARI_H_4VECS_SH(m0, m2, m4, m6, m0, m2, m4, m6, 6); - VP9_ADDBLK_CLIP_AND_STORE_OFF_4H_VECS(dest, dest_stride, m0, m2, m4, m6); - - m6 = (loc0 - vec3); - m2 = (loc1 - vec2); - m4 = (loc2 - vec1); - m0 = (loc3 - vec0); - SRARI_H_4VECS_SH(m0, m2, m4, m6, m0, m2, m4, m6, 6); - VP9_ADDBLK_CLIP_AND_STORE_OFF_4H_VECS((dest + 19 * dest_stride), - dest_stride, m0, m2, m4, m6); + vec0 = LD_SH(tmp_odd_buf); + vec1 = LD_SH(tmp_odd_buf + 9 * 8); + vec2 = LD_SH(tmp_odd_buf + 14 * 8); + vec3 = LD_SH(tmp_odd_buf + 6 * 8); + loc0 = LD_SH(tmp_eve_buf); + loc1 = LD_SH(tmp_eve_buf + 8 * 8); + loc2 = LD_SH(tmp_eve_buf + 4 * 8); + loc3 = LD_SH(tmp_eve_buf + 12 * 8); + + ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m0, m4, m2, m6); + SRARI_H4_SH(m0, m2, m4, m6, 6); + VP9_ADDBLK_ST8x4_UB(dst, (4 * dst_stride), m0, m2, m4, m6); + + SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m6, m2, m4, m0); + SRARI_H4_SH(m0, m2, m4, m6, 6); + VP9_ADDBLK_ST8x4_UB((dst + 19 * dst_stride), (4 * dst_stride), + m0, m2, m4, m6); /* Load 8 & Store 8 */ - vec0 = LOAD_SH(tmp_odd_buf + 4 * 8); - vec1 = LOAD_SH(tmp_odd_buf + 13 * 8); - vec2 = 
LOAD_SH(tmp_odd_buf + 10 * 8); - vec3 = LOAD_SH(tmp_odd_buf + 3 * 8); - loc0 = LOAD_SH(tmp_eve_buf + 2 * 8); - loc1 = LOAD_SH(tmp_eve_buf + 10 * 8); - loc2 = LOAD_SH(tmp_eve_buf + 6 * 8); - loc3 = LOAD_SH(tmp_eve_buf + 14 * 8); - - m1 = (loc0 + vec3); - m5 = (loc1 + vec2); - m3 = (loc2 + vec1); - m7 = (loc3 + vec0); - SRARI_H_4VECS_SH(m1, m3, m5, m7, m1, m3, m5, m7, 6); - VP9_ADDBLK_CLIP_AND_STORE_OFF_4H_VECS((dest + 2 * dest_stride), - dest_stride, m1, m3, m5, m7); - - m7 = (loc0 - vec3); - m3 = (loc1 - vec2); - m5 = (loc2 - vec1); - m1 = (loc3 - vec0); - SRARI_H_4VECS_SH(m1, m3, m5, m7, m1, m3, m5, m7, 6); - VP9_ADDBLK_CLIP_AND_STORE_OFF_4H_VECS((dest + 17 * dest_stride), - dest_stride, m1, m3, m5, m7); + vec0 = LD_SH(tmp_odd_buf + 4 * 8); + vec1 = LD_SH(tmp_odd_buf + 13 * 8); + vec2 = LD_SH(tmp_odd_buf + 10 * 8); + vec3 = LD_SH(tmp_odd_buf + 3 * 8); + loc0 = LD_SH(tmp_eve_buf + 2 * 8); + loc1 = LD_SH(tmp_eve_buf + 10 * 8); + loc2 = LD_SH(tmp_eve_buf + 6 * 8); + loc3 = LD_SH(tmp_eve_buf + 14 * 8); + + ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1, m5, m3, m7); + SRARI_H4_SH(m1, m3, m5, m7, 6); + VP9_ADDBLK_ST8x4_UB((dst + 2 * dst_stride), (4 * dst_stride), + m1, m3, m5, m7); + + SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m7, m3, m5, m1); + SRARI_H4_SH(m1, m3, m5, m7, 6); + VP9_ADDBLK_ST8x4_UB((dst + 17 * dst_stride), (4 * dst_stride), + m1, m3, m5, m7); /* Load 8 & Store 8 */ - vec0 = LOAD_SH(tmp_odd_buf + 2 * 8); - vec1 = LOAD_SH(tmp_odd_buf + 11 * 8); - vec2 = LOAD_SH(tmp_odd_buf + 12 * 8); - vec3 = LOAD_SH(tmp_odd_buf + 7 * 8); - loc0 = LOAD_SH(tmp_eve_buf + 1 * 8); - loc1 = LOAD_SH(tmp_eve_buf + 9 * 8); - loc2 = LOAD_SH(tmp_eve_buf + 5 * 8); - loc3 = LOAD_SH(tmp_eve_buf + 13 * 8); - - n0 = (loc0 + vec3); - n4 = (loc1 + vec2); - n2 = (loc2 + vec1); - n6 = (loc3 + vec0); - SRARI_H_4VECS_SH(n0, n2, n4, n6, n0, n2, n4, n6, 6); - VP9_ADDBLK_CLIP_AND_STORE_OFF_4H_VECS((dest + 1 * dest_stride), - dest_stride, n0, n2, n4, n6); - - n6 = (loc0 - vec3); - n2 = (loc1 - vec2); - n4 = (loc2 - vec1); - n0 = (loc3 - vec0); - SRARI_H_4VECS_SH(n0, n2, n4, n6, n0, n2, n4, n6, 6); - VP9_ADDBLK_CLIP_AND_STORE_OFF_4H_VECS((dest + 18 * dest_stride), - dest_stride, n0, n2, n4, n6); + vec0 = LD_SH(tmp_odd_buf + 2 * 8); + vec1 = LD_SH(tmp_odd_buf + 11 * 8); + vec2 = LD_SH(tmp_odd_buf + 12 * 8); + vec3 = LD_SH(tmp_odd_buf + 7 * 8); + loc0 = LD_SH(tmp_eve_buf + 1 * 8); + loc1 = LD_SH(tmp_eve_buf + 9 * 8); + loc2 = LD_SH(tmp_eve_buf + 5 * 8); + loc3 = LD_SH(tmp_eve_buf + 13 * 8); + + ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0, n4, n2, n6); + SRARI_H4_SH(n0, n2, n4, n6, 6); + VP9_ADDBLK_ST8x4_UB((dst + 1 * dst_stride), (4 * dst_stride), + n0, n2, n4, n6); + + SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n6, n2, n4, n0); + SRARI_H4_SH(n0, n2, n4, n6, 6); + VP9_ADDBLK_ST8x4_UB((dst + 18 * dst_stride), (4 * dst_stride), + n0, n2, n4, n6); /* Load 8 & Store 8 */ - vec0 = LOAD_SH(tmp_odd_buf + 5 * 8); - vec1 = LOAD_SH(tmp_odd_buf + 15 * 8); - vec2 = LOAD_SH(tmp_odd_buf + 8 * 8); - vec3 = LOAD_SH(tmp_odd_buf + 1 * 8); - loc0 = LOAD_SH(tmp_eve_buf + 3 * 8); - loc1 = LOAD_SH(tmp_eve_buf + 11 * 8); - loc2 = LOAD_SH(tmp_eve_buf + 7 * 8); - loc3 = LOAD_SH(tmp_eve_buf + 15 * 8); - - n1 = (loc0 + vec3); - n5 = (loc1 + vec2); - n3 = (loc2 + vec1); - n7 = (loc3 + vec0); - SRARI_H_4VECS_SH(n1, n3, n5, n7, n1, n3, n5, n7, 6); - VP9_ADDBLK_CLIP_AND_STORE_OFF_4H_VECS((dest + 3 * dest_stride), - dest_stride, n1, n3, n5, n7); - - n7 = (loc0 - vec3); - n3 = (loc1 - vec2); - n5 = (loc2 - vec1); 
- n1 = (loc3 - vec0); - SRARI_H_4VECS_SH(n1, n3, n5, n7, n1, n3, n5, n7, 6); - VP9_ADDBLK_CLIP_AND_STORE_OFF_4H_VECS((dest + 16 * dest_stride), - dest_stride, n1, n3, n5, n7); + vec0 = LD_SH(tmp_odd_buf + 5 * 8); + vec1 = LD_SH(tmp_odd_buf + 15 * 8); + vec2 = LD_SH(tmp_odd_buf + 8 * 8); + vec3 = LD_SH(tmp_odd_buf + 1 * 8); + loc0 = LD_SH(tmp_eve_buf + 3 * 8); + loc1 = LD_SH(tmp_eve_buf + 11 * 8); + loc2 = LD_SH(tmp_eve_buf + 7 * 8); + loc3 = LD_SH(tmp_eve_buf + 15 * 8); + + ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1, n5, n3, n7); + SRARI_H4_SH(n1, n3, n5, n7, 6); + VP9_ADDBLK_ST8x4_UB((dst + 3 * dst_stride), (4 * dst_stride), + n1, n3, n5, n7); + + SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n7, n3, n5, n1); + SRARI_H4_SH(n1, n3, n5, n7, 6); + VP9_ADDBLK_ST8x4_UB((dst + 16 * dst_stride), (4 * dst_stride), + n1, n3, n5, n7); } -static void vp9_idct8x32_1d_columns_addblk_msa(int16_t *input, uint8_t *dest, - int32_t dest_stride) { +static void vp9_idct8x32_1d_columns_addblk_msa(int16_t *input, uint8_t *dst, + int32_t dst_stride) { DECLARE_ALIGNED(32, int16_t, tmp_odd_buf[16 * 8]); DECLARE_ALIGNED(32, int16_t, tmp_eve_buf[16 * 8]); vp9_idct8x32_column_even_process_store(input, &tmp_eve_buf[0]); - vp9_idct8x32_column_odd_process_store(input, &tmp_odd_buf[0]); - vp9_idct8x32_column_butterfly_addblk(&tmp_eve_buf[0], &tmp_odd_buf[0], - dest, dest_stride); + dst, dst_stride); } -void vp9_idct32x32_1024_add_msa(const int16_t *input, uint8_t *dest, - int32_t dest_stride) { +void vp9_idct32x32_1024_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { int32_t i; DECLARE_ALIGNED(32, int16_t, out_arr[32 * 32]); int16_t *out_ptr = out_arr; @@ -963,13 +653,13 @@ void vp9_idct32x32_1024_add_msa(const int16_t *input, uint8_t *dest, /* transform columns */ for (i = 0; i < 4; ++i) { /* process 8 * 32 block */ - vp9_idct8x32_1d_columns_addblk_msa((out_ptr + (i << 3)), (dest + (i << 3)), - dest_stride); + vp9_idct8x32_1d_columns_addblk_msa((out_ptr + (i << 3)), (dst + (i << 3)), + dst_stride); } } -void vp9_idct32x32_34_add_msa(const int16_t *input, uint8_t *dest, - int32_t dest_stride) { +void vp9_idct32x32_34_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { int32_t i; DECLARE_ALIGNED(32, int16_t, out_arr[32 * 32]); int16_t *out_ptr = out_arr; @@ -1008,70 +698,42 @@ void vp9_idct32x32_34_add_msa(const int16_t *input, uint8_t *dest, /* transform columns */ for (i = 0; i < 4; ++i) { /* process 8 * 32 block */ - vp9_idct8x32_1d_columns_addblk_msa((out_ptr + (i << 3)), (dest + (i << 3)), - dest_stride); + vp9_idct8x32_1d_columns_addblk_msa((out_ptr + (i << 3)), (dst + (i << 3)), + dst_stride); } } -void vp9_idct32x32_1_add_msa(const int16_t *input, uint8_t *dest, - int32_t dest_stride) { - int32_t i, const1; - v8i16 const2; +void vp9_idct32x32_1_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + int32_t i; int16_t out; - v8i16 res0, res1, res2, res3, res4, res5, res6, res7; - v16u8 dest0, dest1, dest2, dest3; - v16u8 tmp0, tmp1, tmp2, tmp3; - v16i8 zero = { 0 }; - - out = dct_const_round_shift(input[0] * cospi_16_64); - out = dct_const_round_shift(out * cospi_16_64); - const1 = ROUND_POWER_OF_TWO(out, 6); - - const2 = __msa_fill_h(const1); - - for (i = 0; i < 16; ++i) { - dest0 = LOAD_UB(dest); - dest1 = LOAD_UB(dest + 16); - dest2 = LOAD_UB(dest + dest_stride); - dest3 = LOAD_UB(dest + dest_stride + 16); - - res0 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest0); - res1 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest1); - res2 = (v8i16)__msa_ilvr_b(zero, 
(v16i8)dest2); - res3 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest3); - res4 = (v8i16)__msa_ilvl_b(zero, (v16i8)dest0); - res5 = (v8i16)__msa_ilvl_b(zero, (v16i8)dest1); - res6 = (v8i16)__msa_ilvl_b(zero, (v16i8)dest2); - res7 = (v8i16)__msa_ilvl_b(zero, (v16i8)dest3); - - res0 += const2; - res1 += const2; - res2 += const2; - res3 += const2; - res4 += const2; - res5 += const2; - res6 += const2; - res7 += const2; - - res0 = CLIP_UNSIGNED_CHAR_H(res0); - res1 = CLIP_UNSIGNED_CHAR_H(res1); - res2 = CLIP_UNSIGNED_CHAR_H(res2); - res3 = CLIP_UNSIGNED_CHAR_H(res3); - res4 = CLIP_UNSIGNED_CHAR_H(res4); - res5 = CLIP_UNSIGNED_CHAR_H(res5); - res6 = CLIP_UNSIGNED_CHAR_H(res6); - res7 = CLIP_UNSIGNED_CHAR_H(res7); - - tmp0 = (v16u8)__msa_pckev_b((v16i8)res4, (v16i8)res0); - tmp1 = (v16u8)__msa_pckev_b((v16i8)res5, (v16i8)res1); - tmp2 = (v16u8)__msa_pckev_b((v16i8)res6, (v16i8)res2); - tmp3 = (v16u8)__msa_pckev_b((v16i8)res7, (v16i8)res3); - - STORE_UB(tmp0, dest); - STORE_UB(tmp1, dest + 16); - dest += dest_stride; - STORE_UB(tmp2, dest); - STORE_UB(tmp3, dest + 16); - dest += dest_stride; + v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3; + v8i16 res0, res1, res2, res3, res4, res5, res6, res7, vec; + + out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS); + out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS); + out = ROUND_POWER_OF_TWO(out, 6); + + vec = __msa_fill_h(out); + + for (i = 16; i--;) { + LD_UB2(dst, 16, dst0, dst1); + LD_UB2(dst + dst_stride, 16, dst2, dst3); + + UNPCK_UB_SH(dst0, res0, res4); + UNPCK_UB_SH(dst1, res1, res5); + UNPCK_UB_SH(dst2, res2, res6); + UNPCK_UB_SH(dst3, res3, res7); + ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3); + ADD4(res4, vec, res5, vec, res6, vec, res7, vec, res4, res5, res6, res7); + CLIP_SH4_0_255(res0, res1, res2, res3); + CLIP_SH4_0_255(res4, res5, res6, res7); + PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3, + tmp0, tmp1, tmp2, tmp3); + + ST_UB2(tmp0, tmp1, dst, 16); + dst += dst_stride; + ST_UB2(tmp2, tmp3, dst, 16); + dst += dst_stride; } } diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_idct4x4_msa.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_idct4x4_msa.c new file mode 100644 index 00000000000..a3df1a4d320 --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_idct4x4_msa.c @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <assert.h> +#include "vp9/common/mips/msa/vp9_idct_msa.h" + +void vp9_iwht4x4_16_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + v8i16 in0, in1, in2, in3; + v4i32 in0_r, in1_r, in2_r, in3_r, in4_r; + + /* load vector elements of 4x4 block */ + LD4x4_SH(input, in0, in2, in3, in1); + TRANSPOSE4x4_SH_SH(in0, in2, in3, in1, in0, in2, in3, in1); + UNPCK_R_SH_SW(in0, in0_r); + UNPCK_R_SH_SW(in2, in2_r); + UNPCK_R_SH_SW(in3, in3_r); + UNPCK_R_SH_SW(in1, in1_r); + SRA_4V(in0_r, in1_r, in2_r, in3_r, UNIT_QUANT_SHIFT); + + in0_r += in2_r; + in3_r -= in1_r; + in4_r = (in0_r - in3_r) >> 1; + in1_r = in4_r - in1_r; + in2_r = in4_r - in2_r; + in0_r -= in1_r; + in3_r += in2_r; + + TRANSPOSE4x4_SW_SW(in0_r, in1_r, in2_r, in3_r, in0_r, in1_r, in2_r, in3_r); + + in0_r += in1_r; + in2_r -= in3_r; + in4_r = (in0_r - in2_r) >> 1; + in3_r = in4_r - in3_r; + in1_r = in4_r - in1_r; + in0_r -= in3_r; + in2_r += in1_r; + + PCKEV_H4_SH(in0_r, in0_r, in1_r, in1_r, in2_r, in2_r, in3_r, in3_r, + in0, in1, in2, in3); + ADDBLK_ST4x4_UB(in0, in3, in1, in2, dst, dst_stride); +} + +void vp9_iwht4x4_1_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + int16_t a1, e1; + v8i16 in1, in0 = { 0 }; + + a1 = input[0] >> UNIT_QUANT_SHIFT; + e1 = a1 >> 1; + a1 -= e1; + + in0 = __msa_insert_h(in0, 0, a1); + in0 = __msa_insert_h(in0, 1, e1); + in0 = __msa_insert_h(in0, 2, e1); + in0 = __msa_insert_h(in0, 3, e1); + + in1 = in0 >> 1; + in0 -= in1; + + ADDBLK_ST4x4_UB(in0, in1, in1, in1, dst, dst_stride); +} + +void vp9_idct4x4_16_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + v8i16 in0, in1, in2, in3; + + /* load vector elements of 4x4 block */ + LD4x4_SH(input, in0, in1, in2, in3); + /* rows */ + TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); + VP9_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3); + /* columns */ + TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); + VP9_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3); + /* rounding (add 2^3, divide by 2^4) */ + SRARI_H4_SH(in0, in1, in2, in3, 4); + ADDBLK_ST4x4_UB(in0, in1, in2, in3, dst, dst_stride); +} + +void vp9_idct4x4_1_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + int16_t out; + v8i16 vec; + + out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS); + out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS); + out = ROUND_POWER_OF_TWO(out, 4); + vec = __msa_fill_h(out); + + ADDBLK_ST4x4_UB(vec, vec, vec, vec, dst, dst_stride); +} + +void vp9_iht4x4_16_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride, int32_t tx_type) { + v8i16 in0, in1, in2, in3; + + /* load vector elements of 4x4 block */ + LD4x4_SH(input, in0, in1, in2, in3); + TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); + + switch (tx_type) { + case DCT_DCT: + /* DCT in horizontal */ + VP9_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3); + /* DCT in vertical */ + TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); + VP9_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3); + break; + case ADST_DCT: + /* DCT in horizontal */ + VP9_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3); + /* ADST in vertical */ + TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); + VP9_IADST4x4(in0, in1, in2, in3, in0, in1, in2, in3); + break; + case DCT_ADST: + /* ADST in horizontal */ + VP9_IADST4x4(in0, in1, in2, in3, in0, in1, in2, in3); + /* DCT in vertical */ + TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); + VP9_IDCT4x4(in0, in1, in2, in3, in0, in1, 
in2, in3); + break; + case ADST_ADST: + /* ADST in horizontal */ + VP9_IADST4x4(in0, in1, in2, in3, in0, in1, in2, in3); + /* ADST in vertical */ + TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); + VP9_IADST4x4(in0, in1, in2, in3, in0, in1, in2, in3); + break; + default: + assert(0); + break; + } + + /* final rounding (add 2^3, divide by 2^4) and shift */ + SRARI_H4_SH(in0, in1, in2, in3, 4); + /* add block and store 4x4 */ + ADDBLK_ST4x4_UB(in0, in1, in2, in3, dst, dst_stride); +} diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_idct8x8_msa.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_idct8x8_msa.c new file mode 100644 index 00000000000..a4a9c212401 --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_idct8x8_msa.c @@ -0,0 +1,183 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#include "vp9/common/mips/msa/vp9_idct_msa.h" + +void vp9_idct8x8_64_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + + /* load vector elements of 8x8 block */ + LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7); + + /* rows transform */ + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + /* 1D idct8x8 */ + VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + /* columns transform */ + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + /* 1D idct8x8 */ + VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + /* final rounding (add 2^4, divide by 2^5) and shift */ + SRARI_H4_SH(in0, in1, in2, in3, 5); + SRARI_H4_SH(in4, in5, in6, in7, 5); + /* add block and store 8x8 */ + VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3); + dst += (4 * dst_stride); + VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in4, in5, in6, in7); +} + +void vp9_idct8x8_12_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 s0, s1, s2, s3, s4, s5, s6, s7, k0, k1, k2, k3, m0, m1, m2, m3; + v4i32 tmp0, tmp1, tmp2, tmp3; + v8i16 zero = { 0 }; + + /* load vector elements of 8x8 block */ + LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7); + TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); + + /* stage1 */ + ILVL_H2_SH(in3, in0, in2, in1, s0, s1); + k0 = VP9_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64); + k1 = VP9_SET_COSPI_PAIR(cospi_4_64, cospi_28_64); + k2 = VP9_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64); + k3 = VP9_SET_COSPI_PAIR(cospi_12_64, cospi_20_64); + DOTP_SH4_SW(s0, s0, s1, s1, k0, k1, k2, k3, tmp0, tmp1, tmp2, tmp3); + SRARI_W4_SW(tmp0, tmp1, tmp2, tmp3, DCT_CONST_BITS); + PCKEV_H2_SH(zero, tmp0, zero, tmp1, s0, s1); + PCKEV_H2_SH(zero, tmp2, zero, tmp3, s2, s3); + BUTTERFLY_4(s0, s1, s3, s2, s4, s7, s6, s5); + + /* stage2 */ + ILVR_H2_SH(in3, in1, in2, in0, s1, s0); + k0 = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); + k1 = VP9_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); + k2 = 
VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); + k3 = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); + DOTP_SH4_SW(s0, s0, s1, s1, k0, k1, k2, k3, tmp0, tmp1, tmp2, tmp3); + SRARI_W4_SW(tmp0, tmp1, tmp2, tmp3, DCT_CONST_BITS); + PCKEV_H2_SH(zero, tmp0, zero, tmp1, s0, s1); + PCKEV_H2_SH(zero, tmp2, zero, tmp3, s2, s3); + BUTTERFLY_4(s0, s1, s2, s3, m0, m1, m2, m3); + + /* stage3 */ + s0 = __msa_ilvr_h(s6, s5); + + k1 = VP9_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64); + DOTP_SH2_SW(s0, s0, k1, k0, tmp0, tmp1); + SRARI_W2_SW(tmp0, tmp1, DCT_CONST_BITS); + PCKEV_H2_SH(zero, tmp0, zero, tmp1, s2, s3); + + /* stage4 */ + BUTTERFLY_8(m0, m1, m2, m3, s4, s2, s3, s7, + in0, in1, in2, in3, in4, in5, in6, in7); + TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + + /* final rounding (add 2^4, divide by 2^5) and shift */ + SRARI_H4_SH(in0, in1, in2, in3, 5); + SRARI_H4_SH(in4, in5, in6, in7, 5); + + /* add block and store 8x8 */ + VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3); + dst += (4 * dst_stride); + VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in4, in5, in6, in7); +} + +void vp9_idct8x8_1_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + int16_t out; + int32_t val; + v8i16 vec; + + out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS); + out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS); + val = ROUND_POWER_OF_TWO(out, 5); + vec = __msa_fill_h(val); + + VP9_ADDBLK_ST8x4_UB(dst, dst_stride, vec, vec, vec, vec); + dst += (4 * dst_stride); + VP9_ADDBLK_ST8x4_UB(dst, dst_stride, vec, vec, vec, vec); +} + +void vp9_iht8x8_64_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride, int32_t tx_type) { + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + + /* load vector elements of 8x8 block */ + LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7); + + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + + switch (tx_type) { + case DCT_DCT: + /* DCT in horizontal */ + VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + /* DCT in vertical */ + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + break; + case ADST_DCT: + /* DCT in horizontal */ + VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + /* ADST in vertical */ + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + break; + case DCT_ADST: + /* ADST in horizontal */ + VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + /* DCT in vertical */ + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + break; + case ADST_ADST: + /* ADST in horizontal */ + VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + /* ADST in vertical */ + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, 
+ in0, in1, in2, in3, in4, in5, in6, in7); + break; + default: + assert(0); + break; + } + + /* final rounding (add 2^4, divide by 2^5) and shift */ + SRARI_H4_SH(in0, in1, in2, in3, 5); + SRARI_H4_SH(in4, in5, in6, in7, 5); + + /* add block and store 8x8 */ + VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3); + dst += (4 * dst_stride); + VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in4, in5, in6, in7); +} diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_idct_msa.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_idct_msa.h new file mode 100644 index 00000000000..60e27fc1107 --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_idct_msa.h @@ -0,0 +1,481 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP9_COMMON_MIPS_MSA_VP9_IDCT_MSA_H_ +#define VP9_COMMON_MIPS_MSA_VP9_IDCT_MSA_H_ + +#include "vpx_ports/mem.h" +#include "vp9/common/vp9_idct.h" +#include "vp9/common/mips/msa/vp9_macros_msa.h" + +#define VP9_DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) { \ + v8i16 k0_m = __msa_fill_h(cnst0); \ + v4i32 s0_m, s1_m, s2_m, s3_m; \ + \ + s0_m = (v4i32)__msa_fill_h(cnst1); \ + k0_m = __msa_ilvev_h((v8i16)s0_m, k0_m); \ + \ + ILVRL_H2_SW((-reg1), reg0, s1_m, s0_m); \ + ILVRL_H2_SW(reg0, reg1, s3_m, s2_m); \ + DOTP_SH2_SW(s1_m, s0_m, k0_m, k0_m, s1_m, s0_m); \ + SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS); \ + out0 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m); \ + \ + DOTP_SH2_SW(s3_m, s2_m, k0_m, k0_m, s1_m, s0_m); \ + SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS); \ + out1 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m); \ +} + +#define VP9_DOT_ADD_SUB_SRARI_PCK(in0, in1, in2, in3, in4, in5, in6, in7, \ + dst0, dst1, dst2, dst3) { \ + v4i32 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m; \ + v4i32 tp5_m, tp6_m, tp7_m, tp8_m, tp9_m; \ + \ + DOTP_SH4_SW(in0, in1, in0, in1, in4, in4, in5, in5, \ + tp0_m, tp2_m, tp3_m, tp4_m); \ + DOTP_SH4_SW(in2, in3, in2, in3, in6, in6, in7, in7, \ + tp5_m, tp6_m, tp7_m, tp8_m); \ + BUTTERFLY_4(tp0_m, tp3_m, tp7_m, tp5_m, tp1_m, tp9_m, tp7_m, tp5_m); \ + BUTTERFLY_4(tp2_m, tp4_m, tp8_m, tp6_m, tp3_m, tp0_m, tp4_m, tp2_m); \ + SRARI_W4_SW(tp1_m, tp9_m, tp7_m, tp5_m, DCT_CONST_BITS); \ + SRARI_W4_SW(tp3_m, tp0_m, tp4_m, tp2_m, DCT_CONST_BITS); \ + PCKEV_H4_SH(tp1_m, tp3_m, tp9_m, tp0_m, tp7_m, tp4_m, tp5_m, tp2_m, \ + dst0, dst1, dst2, dst3); \ +} + +#define VP9_DOT_SHIFT_RIGHT_PCK_H(in0, in1, in2) ({ \ + v8i16 dst_m; \ + v4i32 tp0_m, tp1_m; \ + \ + DOTP_SH2_SW(in0, in1, in2, in2, tp1_m, tp0_m); \ + SRARI_W2_SW(tp1_m, tp0_m, DCT_CONST_BITS); \ + dst_m = __msa_pckev_h((v8i16)tp1_m, (v8i16)tp0_m); \ + \ + dst_m; \ +}) + +#define VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3, out4, out5, out6, out7) { \ + v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst4_m; \ + v8i16 vec0_m, vec1_m, vec2_m, vec3_m, s0_m, s1_m; \ + v8i16 coeff0_m = { cospi_2_64, cospi_6_64, cospi_10_64, cospi_14_64, \ + cospi_18_64, cospi_22_64, cospi_26_64, cospi_30_64 }; \ + v8i16 coeff1_m = { cospi_8_64, -cospi_8_64, cospi_16_64, \ + -cospi_16_64, cospi_24_64, -cospi_24_64, 0, 0 }; \ + \ + SPLATI_H2_SH(coeff0_m, 0, 7, cnst0_m, cnst1_m); \ + 
cnst2_m = -cnst0_m; \ + ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m); \ + SPLATI_H2_SH(coeff0_m, 4, 3, cnst2_m, cnst3_m); \ + cnst4_m = -cnst2_m; \ + ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m); \ + \ + ILVRL_H2_SH(in0, in7, vec1_m, vec0_m); \ + ILVRL_H2_SH(in4, in3, vec3_m, vec2_m); \ + VP9_DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, \ + cnst1_m, cnst2_m, cnst3_m, in7, in0, \ + in4, in3); \ + \ + SPLATI_H2_SH(coeff0_m, 2, 5, cnst0_m, cnst1_m); \ + cnst2_m = -cnst0_m; \ + ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m); \ + SPLATI_H2_SH(coeff0_m, 6, 1, cnst2_m, cnst3_m); \ + cnst4_m = -cnst2_m; \ + ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m); \ + \ + ILVRL_H2_SH(in2, in5, vec1_m, vec0_m); \ + ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \ + \ + VP9_DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, \ + cnst1_m, cnst2_m, cnst3_m, in5, in2, \ + in6, in1); \ + BUTTERFLY_4(in7, in0, in2, in5, s1_m, s0_m, in2, in5); \ + out7 = -s0_m; \ + out0 = s1_m; \ + \ + SPLATI_H4_SH(coeff1_m, 0, 4, 1, 5, \ + cnst0_m, cnst1_m, cnst2_m, cnst3_m); \ + \ + ILVEV_H2_SH(cnst3_m, cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst2_m); \ + cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + cnst1_m = cnst0_m; \ + \ + ILVRL_H2_SH(in4, in3, vec1_m, vec0_m); \ + ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \ + VP9_DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, \ + cnst2_m, cnst3_m, cnst1_m, out1, out6, \ + s0_m, s1_m); \ + \ + SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m); \ + cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + \ + ILVRL_H2_SH(in2, in5, vec1_m, vec0_m); \ + ILVRL_H2_SH(s0_m, s1_m, vec3_m, vec2_m); \ + out3 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ + out4 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \ + out2 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m); \ + out5 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m); \ + \ + out1 = -out1; \ + out3 = -out3; \ + out5 = -out5; \ +} + +#define VP9_MADD_SHORT(m0, m1, c0, c1, res0, res1) { \ + v4i32 madd0_m, madd1_m, madd2_m, madd3_m; \ + v8i16 madd_s0_m, madd_s1_m; \ + \ + ILVRL_H2_SH(m1, m0, madd_s0_m, madd_s1_m); \ + DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s0_m, madd_s1_m, \ + c0, c0, c1, c1, madd0_m, madd1_m, madd2_m, madd3_m); \ + SRARI_W4_SW(madd0_m, madd1_m, madd2_m, madd3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(madd1_m, madd0_m, madd3_m, madd2_m, res0, res1); \ +} + +#define VP9_MADD_BF(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3, \ + out0, out1, out2, out3) { \ + v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m; \ + v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m, m4_m, m5_m; \ + \ + ILVRL_H2_SH(inp1, inp0, madd_s0_m, madd_s1_m); \ + ILVRL_H2_SH(inp3, inp2, madd_s2_m, madd_s3_m); \ + DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m, \ + cst0, cst0, cst2, cst2, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ + BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m, \ + m4_m, m5_m, tmp3_m, tmp2_m); \ + SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out0, out1); \ + DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m, \ + cst1, cst1, cst3, cst3, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ + BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m, \ + m4_m, m5_m, tmp3_m, tmp2_m); \ + SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out2, out3); \ +} + +#define VP9_SET_COSPI_PAIR(c0_h, c1_h) ({ \ + v8i16 out0_m, r0_m, r1_m; \ + \ + r0_m = __msa_fill_h(c0_h); \ + r1_m = 
__msa_fill_h(c1_h); \ + out0_m = __msa_ilvev_h(r1_m, r0_m); \ + \ + out0_m; \ +}) + +#define VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3) { \ + uint8_t *dst_m = (uint8_t *) (dst); \ + v16u8 dst0_m, dst1_m, dst2_m, dst3_m; \ + v16i8 tmp0_m, tmp1_m; \ + v16i8 zero_m = { 0 }; \ + v8i16 res0_m, res1_m, res2_m, res3_m; \ + \ + LD_UB4(dst_m, dst_stride, dst0_m, dst1_m, dst2_m, dst3_m); \ + ILVR_B4_SH(zero_m, dst0_m, zero_m, dst1_m, zero_m, dst2_m, \ + zero_m, dst3_m, res0_m, res1_m, res2_m, res3_m); \ + ADD4(res0_m, in0, res1_m, in1, res2_m, in2, res3_m, in3, \ + res0_m, res1_m, res2_m, res3_m); \ + CLIP_SH4_0_255(res0_m, res1_m, res2_m, res3_m); \ + PCKEV_B2_SB(res1_m, res0_m, res3_m, res2_m, tmp0_m, tmp1_m); \ + ST8x4_UB(tmp0_m, tmp1_m, dst_m, dst_stride); \ +} + +#define VP9_IDCT4x4(in0, in1, in2, in3, out0, out1, out2, out3) { \ + v8i16 c0_m, c1_m, c2_m, c3_m; \ + v8i16 step0_m, step1_m; \ + v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + \ + c0_m = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); \ + c1_m = VP9_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); \ + step0_m = __msa_ilvr_h(in2, in0); \ + DOTP_SH2_SW(step0_m, step0_m, c0_m, c1_m, tmp0_m, tmp1_m); \ + \ + c2_m = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \ + c3_m = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \ + step1_m = __msa_ilvr_h(in3, in1); \ + DOTP_SH2_SW(step1_m, step1_m, c2_m, c3_m, tmp2_m, tmp3_m); \ + SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ + \ + PCKEV_H2_SW(tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp0_m, tmp2_m); \ + SLDI_B2_0_SW(tmp0_m, tmp2_m, tmp1_m, tmp3_m, 8); \ + BUTTERFLY_4((v8i16)tmp0_m, (v8i16)tmp1_m, \ + (v8i16)tmp2_m, (v8i16)tmp3_m, \ + out0, out1, out2, out3); \ +} + +#define VP9_IADST4x4(in0, in1, in2, in3, out0, out1, out2, out3) { \ + v8i16 res0_m, res1_m, c0_m, c1_m; \ + v8i16 k1_m, k2_m, k3_m, k4_m; \ + v8i16 zero_m = { 0 }; \ + v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + v4i32 int0_m, int1_m, int2_m, int3_m; \ + v8i16 mask_m = { sinpi_1_9, sinpi_2_9, sinpi_3_9, \ + sinpi_4_9, -sinpi_1_9, -sinpi_2_9, -sinpi_3_9, \ + -sinpi_4_9 }; \ + \ + SPLATI_H4_SH(mask_m, 3, 0, 1, 2, c0_m, c1_m, k1_m, k2_m); \ + ILVEV_H2_SH(c0_m, c1_m, k1_m, k2_m, c0_m, c1_m); \ + ILVR_H2_SH(in0, in2, in1, in3, res0_m, res1_m); \ + DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp2_m, tmp1_m); \ + int0_m = tmp2_m + tmp1_m; \ + \ + SPLATI_H2_SH(mask_m, 4, 7, k4_m, k3_m); \ + ILVEV_H2_SH(k4_m, k1_m, k3_m, k2_m, c0_m, c1_m); \ + DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp0_m, tmp1_m); \ + int1_m = tmp0_m + tmp1_m; \ + \ + c0_m = __msa_splati_h(mask_m, 6); \ + ILVL_H2_SH(k2_m, c0_m, zero_m, k2_m, c0_m, c1_m); \ + ILVR_H2_SH(in0, in2, in1, in3, res0_m, res1_m); \ + DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp0_m, tmp1_m); \ + int2_m = tmp0_m + tmp1_m; \ + \ + c0_m = __msa_splati_h(mask_m, 6); \ + c0_m = __msa_ilvev_h(c0_m, k1_m); \ + \ + res0_m = __msa_ilvr_h((in1), (in3)); \ + tmp0_m = __msa_dotp_s_w(res0_m, c0_m); \ + int3_m = tmp2_m + tmp0_m; \ + \ + res0_m = __msa_ilvr_h((in2), (in3)); \ + c1_m = __msa_ilvev_h(k4_m, k3_m); \ + \ + tmp2_m = __msa_dotp_s_w(res0_m, c1_m); \ + res1_m = __msa_ilvr_h((in0), (in2)); \ + c1_m = __msa_ilvev_h(k1_m, zero_m); \ + \ + tmp3_m = __msa_dotp_s_w(res1_m, c1_m); \ + int3_m += tmp2_m; \ + int3_m += tmp3_m; \ + \ + SRARI_W4_SW(int0_m, int1_m, int2_m, int3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(int0_m, int0_m, int1_m, int1_m, out0, out1); \ + PCKEV_H2_SH(int2_m, int2_m, int3_m, int3_m, out2, out3); \ +} + +#define VP9_SET_CONST_PAIR(mask_h, idx1_h, idx2_h) ({ \ + v8i16 c0_m, c1_m; \ + \ + 
SPLATI_H2_SH(mask_h, idx1_h, idx2_h, c0_m, c1_m); \ + c0_m = __msa_ilvev_h(c1_m, c0_m); \ + \ + c0_m; \ +}) + +/* multiply and add macro */ +#define VP9_MADD(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3, \ + out0, out1, out2, out3) { \ + v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m; \ + v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + \ + ILVRL_H2_SH(inp1, inp0, madd_s1_m, madd_s0_m); \ + ILVRL_H2_SH(inp3, inp2, madd_s3_m, madd_s2_m); \ + DOTP_SH4_SW(madd_s1_m, madd_s0_m, madd_s1_m, madd_s0_m, \ + cst0, cst0, cst1, cst1, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ + SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out1); \ + DOTP_SH4_SW(madd_s3_m, madd_s2_m, madd_s3_m, madd_s2_m, \ + cst2, cst2, cst3, cst3, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ + SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out2, out3); \ +} + +/* idct 8x8 macro */ +#define VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3, out4, out5, out6, out7) { \ + v8i16 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m; \ + v8i16 k0_m, k1_m, k2_m, k3_m, res0_m, res1_m, res2_m, res3_m; \ + v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + v8i16 mask_m = { cospi_28_64, cospi_4_64, cospi_20_64, cospi_12_64, \ + cospi_16_64, -cospi_4_64, -cospi_20_64, -cospi_16_64 }; \ + \ + k0_m = VP9_SET_CONST_PAIR(mask_m, 0, 5); \ + k1_m = VP9_SET_CONST_PAIR(mask_m, 1, 0); \ + k2_m = VP9_SET_CONST_PAIR(mask_m, 6, 3); \ + k3_m = VP9_SET_CONST_PAIR(mask_m, 3, 2); \ + VP9_MADD(in1, in7, in3, in5, k0_m, k1_m, k2_m, k3_m, in1, in7, in3, in5); \ + SUB2(in1, in3, in7, in5, res0_m, res1_m); \ + k0_m = VP9_SET_CONST_PAIR(mask_m, 4, 7); \ + k1_m = __msa_splati_h(mask_m, 4); \ + \ + ILVRL_H2_SH(res0_m, res1_m, res2_m, res3_m); \ + DOTP_SH4_SW(res2_m, res3_m, res2_m, res3_m, k0_m, k0_m, k1_m, k1_m, \ + tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ + SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ + tp4_m = in1 + in3; \ + PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, tp5_m, tp6_m); \ + tp7_m = in7 + in5; \ + k2_m = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \ + k3_m = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \ + VP9_MADD(in0, in4, in2, in6, k1_m, k0_m, k2_m, k3_m, \ + in0, in4, in2, in6); \ + BUTTERFLY_4(in0, in4, in2, in6, tp0_m, tp1_m, tp2_m, tp3_m); \ + BUTTERFLY_8(tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m, \ + out0, out1, out2, out3, out4, out5, out6, out7); \ +} + +#define VP9_IADST8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3, out4, out5, out6, out7) { \ + v4i32 r0_m, r1_m, r2_m, r3_m, r4_m, r5_m, r6_m, r7_m; \ + v4i32 m0_m, m1_m, m2_m, m3_m, t0_m, t1_m; \ + v8i16 res0_m, res1_m, res2_m, res3_m, k0_m, k1_m, in_s0, in_s1; \ + v8i16 mask1_m = { cospi_2_64, cospi_30_64, -cospi_2_64, \ + cospi_10_64, cospi_22_64, -cospi_10_64, cospi_18_64, cospi_14_64 }; \ + v8i16 mask2_m = { cospi_14_64, -cospi_18_64, cospi_26_64, \ + cospi_6_64, -cospi_26_64, cospi_8_64, cospi_24_64, -cospi_8_64 }; \ + v8i16 mask3_m = { -cospi_24_64, cospi_8_64, cospi_16_64, \ + -cospi_16_64, 0, 0, 0, 0 }; \ + \ + k0_m = VP9_SET_CONST_PAIR(mask1_m, 0, 1); \ + k1_m = VP9_SET_CONST_PAIR(mask1_m, 1, 2); \ + ILVRL_H2_SH(in1, in0, in_s1, in_s0); \ + DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, \ + r0_m, r1_m, r2_m, r3_m); \ + k0_m = VP9_SET_CONST_PAIR(mask1_m, 6, 7); \ + k1_m = VP9_SET_CONST_PAIR(mask2_m, 0, 1); \ + ILVRL_H2_SH(in5, in4, in_s1, in_s0); \ + DOTP_SH4_SW(in_s1, in_s0, 
in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, \ + r4_m, r5_m, r6_m, r7_m); \ + ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, \ + m0_m, m1_m, m2_m, m3_m); \ + SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res0_m, res1_m); \ + SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, \ + m0_m, m1_m, m2_m, m3_m); \ + SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ + PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, t0_m, t1_m); \ + k0_m = VP9_SET_CONST_PAIR(mask1_m, 3, 4); \ + k1_m = VP9_SET_CONST_PAIR(mask1_m, 4, 5); \ + ILVRL_H2_SH(in3, in2, in_s1, in_s0); \ + DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, \ + r0_m, r1_m, r2_m, r3_m); \ + k0_m = VP9_SET_CONST_PAIR(mask2_m, 2, 3); \ + k1_m = VP9_SET_CONST_PAIR(mask2_m, 3, 4); \ + ILVRL_H2_SH(in7, in6, in_s1, in_s0); \ + DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, \ + r4_m, r5_m, r6_m, r7_m); \ + ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, \ + m0_m, m1_m, m2_m, m3_m); \ + SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res2_m, res3_m); \ + SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, \ + m0_m, m1_m, m2_m, m3_m); \ + SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ + PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, r2_m, r3_m); \ + ILVRL_H2_SW(r3_m, r2_m, m2_m, m3_m); \ + BUTTERFLY_4(res0_m, res1_m, res3_m, res2_m, out0, in7, in4, in3); \ + k0_m = VP9_SET_CONST_PAIR(mask2_m, 5, 6); \ + k1_m = VP9_SET_CONST_PAIR(mask2_m, 6, 7); \ + ILVRL_H2_SH(t1_m, t0_m, in_s1, in_s0); \ + DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, \ + r0_m, r1_m, r2_m, r3_m); \ + k1_m = VP9_SET_CONST_PAIR(mask3_m, 0, 1); \ + DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m, \ + r4_m, r5_m, r6_m, r7_m); \ + ADD4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m, \ + m0_m, m1_m, m2_m, m3_m); \ + SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in1, out6); \ + SUB4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m, \ + m0_m, m1_m, m2_m, m3_m); \ + SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in2, in5); \ + k0_m = VP9_SET_CONST_PAIR(mask3_m, 2, 2); \ + k1_m = VP9_SET_CONST_PAIR(mask3_m, 2, 3); \ + ILVRL_H2_SH(in4, in3, in_s1, in_s0); \ + DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, \ + m0_m, m1_m, m2_m, m3_m); \ + SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in3, out4); \ + ILVRL_H2_SW(in5, in2, m2_m, m3_m); \ + DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m, \ + m0_m, m1_m, m2_m, m3_m); \ + SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, out2, in5); \ + \ + out1 = -in1; \ + out3 = -in3; \ + out5 = -in5; \ + out7 = -in7; \ +} + +#define VP9_IADST8x16_1D(r0, r1, r2, r3, r4, r5, r6, r7, r8, \ + r9, r10, r11, r12, r13, r14, r15, \ + out0, out1, out2, out3, out4, out5, \ + out6, out7, out8, out9, out10, out11, \ + out12, out13, out14, out15) { \ + v8i16 g0_m, g1_m, g2_m, g3_m, g4_m, g5_m, g6_m, g7_m; \ + v8i16 g8_m, g9_m, g10_m, g11_m, g12_m, g13_m, g14_m, g15_m; \ + v8i16 h0_m, h1_m, h2_m, h3_m, h4_m, h5_m, h6_m, h7_m; \ + v8i16 h8_m, h9_m, h10_m, h11_m; \ + v8i16 k0_m, k1_m, k2_m, k3_m; \ + \ + /* stage 1 */ \ + k0_m = VP9_SET_COSPI_PAIR(cospi_1_64, cospi_31_64); \ + k1_m = VP9_SET_COSPI_PAIR(cospi_31_64, -cospi_1_64); \ + k2_m = VP9_SET_COSPI_PAIR(cospi_17_64, cospi_15_64); \ + k3_m = VP9_SET_COSPI_PAIR(cospi_15_64, 
-cospi_17_64); \ + VP9_MADD_BF(r15, r0, r7, r8, k0_m, k1_m, k2_m, k3_m, \ + g0_m, g1_m, g2_m, g3_m); \ + k0_m = VP9_SET_COSPI_PAIR(cospi_5_64, cospi_27_64); \ + k1_m = VP9_SET_COSPI_PAIR(cospi_27_64, -cospi_5_64); \ + k2_m = VP9_SET_COSPI_PAIR(cospi_21_64, cospi_11_64); \ + k3_m = VP9_SET_COSPI_PAIR(cospi_11_64, -cospi_21_64); \ + VP9_MADD_BF(r13, r2, r5, r10, k0_m, k1_m, k2_m, k3_m, \ + g4_m, g5_m, g6_m, g7_m); \ + k0_m = VP9_SET_COSPI_PAIR(cospi_9_64, cospi_23_64); \ + k1_m = VP9_SET_COSPI_PAIR(cospi_23_64, -cospi_9_64); \ + k2_m = VP9_SET_COSPI_PAIR(cospi_25_64, cospi_7_64); \ + k3_m = VP9_SET_COSPI_PAIR(cospi_7_64, -cospi_25_64); \ + VP9_MADD_BF(r11, r4, r3, r12, k0_m, k1_m, k2_m, k3_m, \ + g8_m, g9_m, g10_m, g11_m); \ + k0_m = VP9_SET_COSPI_PAIR(cospi_13_64, cospi_19_64); \ + k1_m = VP9_SET_COSPI_PAIR(cospi_19_64, -cospi_13_64); \ + k2_m = VP9_SET_COSPI_PAIR(cospi_29_64, cospi_3_64); \ + k3_m = VP9_SET_COSPI_PAIR(cospi_3_64, -cospi_29_64); \ + VP9_MADD_BF(r9, r6, r1, r14, k0_m, k1_m, k2_m, k3_m, \ + g12_m, g13_m, g14_m, g15_m); \ + \ + /* stage 2 */ \ + k0_m = VP9_SET_COSPI_PAIR(cospi_4_64, cospi_28_64); \ + k1_m = VP9_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64); \ + k2_m = VP9_SET_COSPI_PAIR(-cospi_28_64, cospi_4_64); \ + VP9_MADD_BF(g1_m, g3_m, g9_m, g11_m, k0_m, k1_m, k2_m, k0_m, \ + h0_m, h1_m, h2_m, h3_m); \ + k0_m = VP9_SET_COSPI_PAIR(cospi_12_64, cospi_20_64); \ + k1_m = VP9_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64); \ + k2_m = VP9_SET_COSPI_PAIR(cospi_20_64, -cospi_12_64); \ + VP9_MADD_BF(g7_m, g5_m, g15_m, g13_m, k0_m, k1_m, k2_m, k0_m, \ + h4_m, h5_m, h6_m, h7_m); \ + BUTTERFLY_4(h0_m, h2_m, h6_m, h4_m, out8, out9, out11, out10); \ + BUTTERFLY_8(g0_m, g2_m, g4_m, g6_m, g14_m, g12_m, g10_m, g8_m, \ + h8_m, h9_m, h10_m, h11_m, h6_m, h4_m, h2_m, h0_m); \ + \ + /* stage 3 */ \ + BUTTERFLY_4(h8_m, h9_m, h11_m, h10_m, out0, out1, h11_m, h10_m); \ + k0_m = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \ + k1_m = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \ + k2_m = VP9_SET_COSPI_PAIR(-cospi_24_64, cospi_8_64); \ + VP9_MADD_BF(h0_m, h2_m, h4_m, h6_m, k0_m, k1_m, k2_m, k0_m, \ + out4, out6, out5, out7); \ + VP9_MADD_BF(h1_m, h3_m, h5_m, h7_m, k0_m, k1_m, k2_m, k0_m, \ + out12, out14, out13, out15); \ + \ + /* stage 4 */ \ + k0_m = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); \ + k1_m = VP9_SET_COSPI_PAIR(-cospi_16_64, -cospi_16_64); \ + k2_m = VP9_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); \ + k3_m = VP9_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64); \ + VP9_MADD_SHORT(h10_m, h11_m, k1_m, k2_m, out2, out3); \ + VP9_MADD_SHORT(out6, out7, k0_m, k3_m, out6, out7); \ + VP9_MADD_SHORT(out10, out11, k0_m, k3_m, out10, out11); \ + VP9_MADD_SHORT(out14, out15, k1_m, k2_m, out14, out15); \ +} +#endif /* VP9_COMMON_MIPS_MSA_VP9_IDCT_MSA_H_ */ diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_intra_predict_msa.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_intra_predict_msa.c new file mode 100644 index 00000000000..2fc610505a8 --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_intra_predict_msa.c @@ -0,0 +1,737 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp9_rtcd.h" +#include "vp9/common/mips/msa/vp9_macros_msa.h" + +#define IPRED_SUBS_UH2_UH(in0, in1, out0, out1) { \ + out0 = __msa_subs_u_h(out0, in0); \ + out1 = __msa_subs_u_h(out1, in1); \ +} + +static void intra_predict_vert_4x4_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t src_data; + + src_data = LW(src); + + SW4(src_data, src_data, src_data, src_data, dst, dst_stride); +} + +static void intra_predict_vert_8x8_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t row; + uint32_t src_data1, src_data2; + + src_data1 = LW(src); + src_data2 = LW(src + 4); + + for (row = 8; row--;) { + SW(src_data1, dst); + SW(src_data2, (dst + 4)); + dst += dst_stride; + } +} + +static void intra_predict_vert_16x16_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t row; + v16u8 src0; + + src0 = LD_UB(src); + + for (row = 16; row--;) { + ST_UB(src0, dst); + dst += dst_stride; + } +} + +static void intra_predict_vert_32x32_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t row; + v16u8 src1, src2; + + src1 = LD_UB(src); + src2 = LD_UB(src + 16); + + for (row = 32; row--;) { + ST_UB2(src1, src2, dst, 16); + dst += dst_stride; + } +} + +static void intra_predict_horiz_4x4_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t out0, out1, out2, out3; + + out0 = src[0] * 0x01010101; + out1 = src[1] * 0x01010101; + out2 = src[2] * 0x01010101; + out3 = src[3] * 0x01010101; + + SW4(out0, out1, out2, out3, dst, dst_stride); +} + +static void intra_predict_horiz_8x8_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + + out0 = src[0] * 0x0101010101010101ull; + out1 = src[1] * 0x0101010101010101ull; + out2 = src[2] * 0x0101010101010101ull; + out3 = src[3] * 0x0101010101010101ull; + out4 = src[4] * 0x0101010101010101ull; + out5 = src[5] * 0x0101010101010101ull; + out6 = src[6] * 0x0101010101010101ull; + out7 = src[7] * 0x0101010101010101ull; + + SD4(out0, out1, out2, out3, dst, dst_stride); + dst += (4 * dst_stride); + SD4(out4, out5, out6, out7, dst, dst_stride); +} + +static void intra_predict_horiz_16x16_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t row; + uint8_t inp0, inp1, inp2, inp3; + v16u8 src0, src1, src2, src3; + + for (row = 4; row--;) { + inp0 = src[0]; + inp1 = src[1]; + inp2 = src[2]; + inp3 = src[3]; + src += 4; + + src0 = (v16u8)__msa_fill_b(inp0); + src1 = (v16u8)__msa_fill_b(inp1); + src2 = (v16u8)__msa_fill_b(inp2); + src3 = (v16u8)__msa_fill_b(inp3); + + ST_UB4(src0, src1, src2, src3, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void intra_predict_horiz_32x32_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t row; + uint8_t inp0, inp1, inp2, inp3; + v16u8 src0, src1, src2, src3; + + for (row = 8; row--;) { + inp0 = src[0]; + inp1 = src[1]; + inp2 = src[2]; + inp3 = src[3]; + src += 4; + + src0 = (v16u8)__msa_fill_b(inp0); + src1 = (v16u8)__msa_fill_b(inp1); + src2 = (v16u8)__msa_fill_b(inp2); + src3 = (v16u8)__msa_fill_b(inp3); + + ST_UB2(src0, src0, dst, 16); + dst += dst_stride; + ST_UB2(src1, src1, dst, 16); + dst += dst_stride; + ST_UB2(src2, src2, dst, 16); + dst += dst_stride; + ST_UB2(src3, src3, dst, 16); + dst += dst_stride; + } +} + +static void intra_predict_dc_4x4_msa(const uint8_t *src_top, + const uint8_t *src_left, + 
uint8_t *dst, int32_t dst_stride) { + uint32_t val0, val1; + v16i8 store, src = { 0 }; + v8u16 sum_h; + v4u32 sum_w; + v2u64 sum_d; + + val0 = LW(src_top); + val1 = LW(src_left); + INSERT_W2_SB(val0, val1, src); + sum_h = __msa_hadd_u_h((v16u8)src, (v16u8)src); + sum_w = __msa_hadd_u_w(sum_h, sum_h); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 3); + store = __msa_splati_b((v16i8)sum_w, 0); + val0 = __msa_copy_u_w((v4i32)store, 0); + + SW4(val0, val0, val0, val0, dst, dst_stride); +} + +static void intra_predict_dc_tl_4x4_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t val0; + v16i8 store, data = { 0 }; + v8u16 sum_h; + v4u32 sum_w; + + val0 = LW(src); + data = (v16i8)__msa_insert_w((v4i32)data, 0, val0); + sum_h = __msa_hadd_u_h((v16u8)data, (v16u8)data); + sum_w = __msa_hadd_u_w(sum_h, sum_h); + sum_w = (v4u32)__msa_srari_w((v4i32)sum_w, 2); + store = __msa_splati_b((v16i8)sum_w, 0); + val0 = __msa_copy_u_w((v4i32)store, 0); + + SW4(val0, val0, val0, val0, dst, dst_stride); +} + +static void intra_predict_128dc_4x4_msa(uint8_t *dst, int32_t dst_stride) { + uint32_t out; + const v16i8 store = __msa_ldi_b(128); + + out = __msa_copy_u_w((v4i32)store, 0); + + SW4(out, out, out, out, dst, dst_stride); +} + +static void intra_predict_dc_8x8_msa(const uint8_t *src_top, + const uint8_t *src_left, + uint8_t *dst, int32_t dst_stride) { + uint64_t val0, val1; + v16i8 store; + v16u8 src = { 0 }; + v8u16 sum_h; + v4u32 sum_w; + v2u64 sum_d; + + val0 = LD(src_top); + val1 = LD(src_left); + INSERT_D2_UB(val0, val1, src); + sum_h = __msa_hadd_u_h(src, src); + sum_w = __msa_hadd_u_w(sum_h, sum_h); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 4); + store = __msa_splati_b((v16i8)sum_w, 0); + val0 = __msa_copy_u_d((v2i64)store, 0); + + SD4(val0, val0, val0, val0, dst, dst_stride); + dst += (4 * dst_stride); + SD4(val0, val0, val0, val0, dst, dst_stride); +} + +static void intra_predict_dc_tl_8x8_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint64_t val0; + v16i8 store; + v16u8 data = { 0 }; + v8u16 sum_h; + v4u32 sum_w; + v2u64 sum_d; + + val0 = LD(src); + data = (v16u8)__msa_insert_d((v2i64)data, 0, val0); + sum_h = __msa_hadd_u_h(data, data); + sum_w = __msa_hadd_u_w(sum_h, sum_h); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 3); + store = __msa_splati_b((v16i8)sum_w, 0); + val0 = __msa_copy_u_d((v2i64)store, 0); + + SD4(val0, val0, val0, val0, dst, dst_stride); + dst += (4 * dst_stride); + SD4(val0, val0, val0, val0, dst, dst_stride); +} + +static void intra_predict_128dc_8x8_msa(uint8_t *dst, int32_t dst_stride) { + uint64_t out; + const v16i8 store = __msa_ldi_b(128); + + out = __msa_copy_u_d((v2i64)store, 0); + + SD4(out, out, out, out, dst, dst_stride); + dst += (4 * dst_stride); + SD4(out, out, out, out, dst, dst_stride); +} + +static void intra_predict_dc_16x16_msa(const uint8_t *src_top, + const uint8_t *src_left, + uint8_t *dst, int32_t dst_stride) { + v16u8 top, left, out; + v8u16 sum_h, sum_top, sum_left; + v4u32 sum_w; + v2u64 sum_d; + + top = LD_UB(src_top); + left = LD_UB(src_left); + HADD_UB2_UH(top, left, sum_top, sum_left); + sum_h = sum_top + sum_left; + sum_w = __msa_hadd_u_w(sum_h, sum_h); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d); + sum_d = __msa_hadd_u_d(sum_w, sum_w); 
+ sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 5); + out = (v16u8)__msa_splati_b((v16i8)sum_w, 0); + + ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); + dst += (8 * dst_stride); + ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); +} + +static void intra_predict_dc_tl_16x16_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + v16u8 data, out; + v8u16 sum_h; + v4u32 sum_w; + v2u64 sum_d; + + data = LD_UB(src); + sum_h = __msa_hadd_u_h(data, data); + sum_w = __msa_hadd_u_w(sum_h, sum_h); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 4); + out = (v16u8)__msa_splati_b((v16i8)sum_w, 0); + + ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); + dst += (8 * dst_stride); + ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); +} + +static void intra_predict_128dc_16x16_msa(uint8_t *dst, int32_t dst_stride) { + const v16u8 out = (v16u8)__msa_ldi_b(128); + + ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); + dst += (8 * dst_stride); + ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); +} + +static void intra_predict_dc_32x32_msa(const uint8_t *src_top, + const uint8_t *src_left, + uint8_t *dst, int32_t dst_stride) { + uint32_t row; + v16u8 top0, top1, left0, left1, out; + v8u16 sum_h, sum_top0, sum_top1, sum_left0, sum_left1; + v4u32 sum_w; + v2u64 sum_d; + + LD_UB2(src_top, 16, top0, top1); + LD_UB2(src_left, 16, left0, left1); + HADD_UB2_UH(top0, top1, sum_top0, sum_top1); + HADD_UB2_UH(left0, left1, sum_left0, sum_left1); + sum_h = sum_top0 + sum_top1; + sum_h += sum_left0 + sum_left1; + sum_w = __msa_hadd_u_w(sum_h, sum_h); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 6); + out = (v16u8)__msa_splati_b((v16i8)sum_w, 0); + + for (row = 16; row--;) { + ST_UB2(out, out, dst, 16); + dst += dst_stride; + ST_UB2(out, out, dst, 16); + dst += dst_stride; + } +} + +static void intra_predict_dc_tl_32x32_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t row; + v16u8 data0, data1, out; + v8u16 sum_h, sum_data0, sum_data1; + v4u32 sum_w; + v2u64 sum_d; + + LD_UB2(src, 16, data0, data1); + HADD_UB2_UH(data0, data1, sum_data0, sum_data1); + sum_h = sum_data0 + sum_data1; + sum_w = __msa_hadd_u_w(sum_h, sum_h); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 5); + out = (v16u8)__msa_splati_b((v16i8)sum_w, 0); + + for (row = 16; row--;) { + ST_UB2(out, out, dst, 16); + dst += dst_stride; + ST_UB2(out, out, dst, 16); + dst += dst_stride; + } +} + +static void intra_predict_128dc_32x32_msa(uint8_t *dst, int32_t dst_stride) { + uint32_t row; + const v16u8 out = (v16u8)__msa_ldi_b(128); + + for (row = 16; row--;) { + ST_UB2(out, out, dst, 16); + dst += dst_stride; + ST_UB2(out, out, dst, 16); + dst += dst_stride; + } +} + +static void intra_predict_tm_4x4_msa(const uint8_t *src_top_ptr, + const uint8_t *src_left, + uint8_t *dst, int32_t dst_stride) { + uint32_t val; + uint8_t top_left = src_top_ptr[-1]; + v16i8 src_left0, src_left1, src_left2, src_left3, tmp0, tmp1, src_top = { 0 }; + v16u8 src0, src1, src2, src3; + v8u16 src_top_left, vec0, vec1, vec2, vec3; + + src_top_left = 
(v8u16)__msa_fill_h(top_left); + val = LW(src_top_ptr); + src_top = (v16i8)__msa_insert_w((v4i32)src_top, 0, val); + + src_left0 = __msa_fill_b(src_left[0]); + src_left1 = __msa_fill_b(src_left[1]); + src_left2 = __msa_fill_b(src_left[2]); + src_left3 = __msa_fill_b(src_left[3]); + + ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top, + src_left3, src_top, src0, src1, src2, src3); + HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3); + SAT_UH4_UH(vec0, vec1, vec2, vec3, 7); + PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1); + ST4x4_UB(tmp0, tmp1, 0, 2, 0, 2, dst, dst_stride); +} + +static void intra_predict_tm_8x8_msa(const uint8_t *src_top_ptr, + const uint8_t *src_left, + uint8_t *dst, int32_t dst_stride) { + uint64_t val; + uint8_t top_left = src_top_ptr[-1]; + uint32_t loop_cnt; + v16i8 src_left0, src_left1, src_left2, src_left3, tmp0, tmp1, src_top = { 0 }; + v8u16 src_top_left, vec0, vec1, vec2, vec3; + v16u8 src0, src1, src2, src3; + + val = LD(src_top_ptr); + src_top = (v16i8)__msa_insert_d((v2i64)src_top, 0, val); + src_top_left = (v8u16)__msa_fill_h(top_left); + + for (loop_cnt = 2; loop_cnt--;) { + src_left0 = __msa_fill_b(src_left[0]); + src_left1 = __msa_fill_b(src_left[1]); + src_left2 = __msa_fill_b(src_left[2]); + src_left3 = __msa_fill_b(src_left[3]); + src_left += 4; + + ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top, + src_left3, src_top, src0, src1, src2, src3); + HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3); + SAT_UH4_UH(vec0, vec1, vec2, vec3, 7); + PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1); + ST8x4_UB(tmp0, tmp1, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void intra_predict_tm_16x16_msa(const uint8_t *src_top_ptr, + const uint8_t *src_left, + uint8_t *dst, int32_t dst_stride) { + uint8_t top_left = src_top_ptr[-1]; + uint32_t loop_cnt; + v16i8 src_top, src_left0, src_left1, src_left2, src_left3; + v8u16 src_top_left, res_r, res_l; + + src_top = LD_SB(src_top_ptr); + src_top_left = (v8u16)__msa_fill_h(top_left); + + for (loop_cnt = 4; loop_cnt--;) { + src_left0 = __msa_fill_b(src_left[0]); + src_left1 = __msa_fill_b(src_left[1]); + src_left2 = __msa_fill_b(src_left[2]); + src_left3 = __msa_fill_b(src_left[3]); + src_left += 4; + + ILVRL_B2_UH(src_left0, src_top, res_r, res_l); + HADD_UB2_UH(res_r, res_l, res_r, res_l); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l); + + SAT_UH2_UH(res_r, res_l, 7); + PCKEV_ST_SB(res_r, res_l, dst); + dst += dst_stride; + + ILVRL_B2_UH(src_left1, src_top, res_r, res_l); + HADD_UB2_UH(res_r, res_l, res_r, res_l); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l); + SAT_UH2_UH(res_r, res_l, 7); + PCKEV_ST_SB(res_r, res_l, dst); + dst += dst_stride; + + ILVRL_B2_UH(src_left2, src_top, res_r, res_l); + HADD_UB2_UH(res_r, res_l, res_r, res_l); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l); + SAT_UH2_UH(res_r, res_l, 7); + PCKEV_ST_SB(res_r, res_l, dst); + dst += dst_stride; + + ILVRL_B2_UH(src_left3, src_top, res_r, res_l); + HADD_UB2_UH(res_r, res_l, res_r, res_l); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l); + SAT_UH2_UH(res_r, res_l, 7); + PCKEV_ST_SB(res_r, res_l, dst); + dst += dst_stride; + } +} + +static void intra_predict_tm_32x32_msa(const 
uint8_t *src_top, + const uint8_t *src_left, + uint8_t *dst, int32_t dst_stride) { + uint8_t top_left = src_top[-1]; + uint32_t loop_cnt; + v16i8 src_top0, src_top1, src_left0, src_left1, src_left2, src_left3; + v8u16 src_top_left, res_r0, res_r1, res_l0, res_l1; + + LD_SB2(src_top, 16, src_top0, src_top1); + src_top_left = (v8u16)__msa_fill_h(top_left); + + for (loop_cnt = 8; loop_cnt--;) { + src_left0 = __msa_fill_b(src_left[0]); + src_left1 = __msa_fill_b(src_left[1]); + src_left2 = __msa_fill_b(src_left[2]); + src_left3 = __msa_fill_b(src_left[3]); + src_left += 4; + + ILVR_B2_UH(src_left0, src_top0, src_left0, src_top1, res_r0, res_r1); + ILVL_B2_UH(src_left0, src_top0, src_left0, src_top1, res_l0, res_l1); + HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1); + SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7); + PCKEV_ST_SB(res_r0, res_l0, dst); + PCKEV_ST_SB(res_r1, res_l1, dst + 16); + dst += dst_stride; + + ILVR_B2_UH(src_left1, src_top0, src_left1, src_top1, res_r0, res_r1); + ILVL_B2_UH(src_left1, src_top0, src_left1, src_top1, res_l0, res_l1); + HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1); + SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7); + PCKEV_ST_SB(res_r0, res_l0, dst); + PCKEV_ST_SB(res_r1, res_l1, dst + 16); + dst += dst_stride; + + ILVR_B2_UH(src_left2, src_top0, src_left2, src_top1, res_r0, res_r1); + ILVL_B2_UH(src_left2, src_top0, src_left2, src_top1, res_l0, res_l1); + HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1); + SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7); + PCKEV_ST_SB(res_r0, res_l0, dst); + PCKEV_ST_SB(res_r1, res_l1, dst + 16); + dst += dst_stride; + + ILVR_B2_UH(src_left3, src_top0, src_left3, src_top1, res_r0, res_r1); + ILVL_B2_UH(src_left3, src_top0, src_left3, src_top1, res_l0, res_l1); + HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1); + SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7); + PCKEV_ST_SB(res_r0, res_l0, dst); + PCKEV_ST_SB(res_r1, res_l1, dst + 16); + dst += dst_stride; + } +} + +void vp9_v_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + intra_predict_vert_4x4_msa(above, dst, y_stride); +} + +void vp9_v_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + intra_predict_vert_8x8_msa(above, dst, y_stride); +} + +void vp9_v_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + intra_predict_vert_16x16_msa(above, dst, y_stride); +} + +void vp9_v_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + intra_predict_vert_32x32_msa(above, dst, y_stride); +} + +void vp9_h_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + + intra_predict_horiz_4x4_msa(left, dst, y_stride); +} + +void 
vp9_h_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + + intra_predict_horiz_8x8_msa(left, dst, y_stride); +} + +void vp9_h_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + + intra_predict_horiz_16x16_msa(left, dst, y_stride); +} + +void vp9_h_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + + intra_predict_horiz_32x32_msa(left, dst, y_stride); +} + +void vp9_dc_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + intra_predict_dc_4x4_msa(above, left, dst, y_stride); +} + +void vp9_dc_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + intra_predict_dc_8x8_msa(above, left, dst, y_stride); +} + +void vp9_dc_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + intra_predict_dc_16x16_msa(above, left, dst, y_stride); +} + +void vp9_dc_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + intra_predict_dc_32x32_msa(above, left, dst, y_stride); +} + +void vp9_dc_top_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + intra_predict_dc_tl_4x4_msa(above, dst, y_stride); +} + +void vp9_dc_top_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + intra_predict_dc_tl_8x8_msa(above, dst, y_stride); +} + +void vp9_dc_top_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + intra_predict_dc_tl_16x16_msa(above, dst, y_stride); +} + +void vp9_dc_top_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + intra_predict_dc_tl_32x32_msa(above, dst, y_stride); +} + +void vp9_dc_left_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + + intra_predict_dc_tl_4x4_msa(left, dst, y_stride); +} + +void vp9_dc_left_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + + intra_predict_dc_tl_8x8_msa(left, dst, y_stride); +} + +void vp9_dc_left_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + + intra_predict_dc_tl_16x16_msa(left, dst, y_stride); +} + +void vp9_dc_left_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + + intra_predict_dc_tl_32x32_msa(left, dst, y_stride); +} + +void vp9_dc_128_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + + intra_predict_128dc_4x4_msa(dst, y_stride); +} + +void vp9_dc_128_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + + intra_predict_128dc_8x8_msa(dst, y_stride); +} + +void vp9_dc_128_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + + intra_predict_128dc_16x16_msa(dst, y_stride); +} + +void vp9_dc_128_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + + 
intra_predict_128dc_32x32_msa(dst, y_stride); +} + +void vp9_tm_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + intra_predict_tm_4x4_msa(above, left, dst, y_stride); +} + +void vp9_tm_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + intra_predict_tm_8x8_msa(above, left, dst, y_stride); +} + +void vp9_tm_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + intra_predict_tm_16x16_msa(above, left, dst, y_stride); +} + +void vp9_tm_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + intra_predict_tm_32x32_msa(above, left, dst, y_stride); +} diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_loopfilter_16_msa.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_loopfilter_16_msa.c new file mode 100644 index 00000000000..aeaa48e4e60 --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_loopfilter_16_msa.c @@ -0,0 +1,1480 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_ports/mem.h" +#include "vp9/common/mips/msa/vp9_loopfilter_msa.h" + +int32_t vp9_hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch, + uint8_t *filter48, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + v16u8 flat, mask, hev, thresh, b_limit, limit; + v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r; + v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; + v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r; + v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l; + v16u8 zero = { 0 }; + + /* load vector elements */ + LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); + + thresh = (v16u8)__msa_fill_b(*thresh_ptr); + b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); + limit = (v16u8)__msa_fill_b(*limit_ptr); + + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, + hev, mask, flat); + VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + if (__msa_test_bz_v(flat)) { + ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch); + + return 1; + } else { + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, + zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, + q2_r, q3_r); + VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, + p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); + + ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l); + ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l); + VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); + + /* convert 16 bit output data into 8 bit */ + PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l, + p0_filt8_r, q0_filt8_l, 
q0_filt8_r, p2_filt8_r, p1_filt8_r, + p0_filt8_r, q0_filt8_r); + PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r, + q2_filt8_r); + + /* store pixel values */ + p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat); + p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat); + p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat); + q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat); + q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat); + q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat); + + ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16); + filter48 += (4 * 16); + ST_UB2(q1_out, q2_out, filter48, 16); + filter48 += (2 * 16); + ST_UB(flat, filter48); + + return 0; + } +} + +void vp9_hz_lpf_t16_16w(uint8_t *src, int32_t pitch, uint8_t *filter48) { + v16u8 flat, flat2, filter8; + v16i8 zero = { 0 }; + v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in; + v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in; + v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in, p3_l_in, p2_l_in, p1_l_in, p0_l_in; + v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in, q3_l_in, q2_l_in, q1_l_in, q0_l_in; + v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l; + v8i16 l_out, r_out; + + flat = LD_UB(filter48 + 96); + + LD_UB8((src - 8 * pitch), pitch, p7, p6, p5, p4, p3, p2, p1, p0); + LD_UB8(src, pitch, q0, q1, q2, q3, q4, q5, q6, q7); + VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2); + + if (__msa_test_bz_v(flat2)) { + LD_UB4(filter48, 16, p2, p1, p0, q0); + LD_UB2(filter48 + 4 * 16, 16, q1, q2); + + src -= 3 * pitch; + ST_UB4(p2, p1, p0, q0, src, pitch); + src += (4 * pitch); + ST_UB2(q1, q2, src, pitch); + } else { + src -= 7 * pitch; + + ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, + zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, + p2_r_in, p1_r_in, p0_r_in); + + q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0); + + tmp0_r = p7_r_in << 3; + tmp0_r -= p7_r_in; + tmp0_r += p6_r_in; + tmp0_r += q0_r_in; + tmp1_r = p6_r_in + p5_r_in; + tmp1_r += p4_r_in; + tmp1_r += p3_r_in; + tmp1_r += p2_r_in; + tmp1_r += p1_r_in; + tmp1_r += p0_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in, + p5_l_in, p4_l_in); + ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in, + p1_l_in, p0_l_in); + q0_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q0); + + tmp0_l = p7_l_in << 3; + tmp0_l -= p7_l_in; + tmp0_l += p6_l_in; + tmp0_l += q0_l_in; + tmp1_l = p6_l_in + p5_l_in; + tmp1_l += p4_l_in; + tmp1_l += p3_l_in; + tmp1_l += p2_l_in; + tmp1_l += p1_l_in; + tmp1_l += p0_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2); + ST_UB(p6, src); + src += pitch; + + /* p5 */ + q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1); + tmp0_r = p5_r_in - p6_r_in; + tmp0_r += q1_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + q1_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q1); + tmp0_l = p5_l_in - p6_l_in; + tmp0_l += q1_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2); + ST_UB(p5, src); + src += pitch; + + /* p4 */ + q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2); + tmp0_r 
= p4_r_in - p5_r_in; + tmp0_r += q2_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = (v8i16)__msa_srari_h((v8i16)tmp1_r, 4); + + q2_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q2); + tmp0_l = p4_l_in - p5_l_in; + tmp0_l += q2_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2); + ST_UB(p4, src); + src += pitch; + + /* p3 */ + q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3); + tmp0_r = p3_r_in - p4_r_in; + tmp0_r += q3_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + q3_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q3); + tmp0_l = p3_l_in - p4_l_in; + tmp0_l += q3_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2); + ST_UB(p3, src); + src += pitch; + + /* p2 */ + q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4); + filter8 = LD_UB(filter48); + tmp0_r = p2_r_in - p3_r_in; + tmp0_r += q4_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + q4_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q4); + tmp0_l = p2_l_in - p3_l_in; + tmp0_l += q4_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += pitch; + + /* p1 */ + q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5); + filter8 = LD_UB(filter48 + 16); + tmp0_r = p1_r_in - p2_r_in; + tmp0_r += q5_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + q5_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q5); + tmp0_l = p1_l_in - p2_l_in; + tmp0_l += q5_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += pitch; + + /* p0 */ + q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6); + filter8 = LD_UB(filter48 + 32); + tmp0_r = p0_r_in - p1_r_in; + tmp0_r += q6_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + q6_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q6); + tmp0_l = p0_l_in - p1_l_in; + tmp0_l += q6_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += pitch; + + /* q0 */ + q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7); + filter8 = LD_UB(filter48 + 48); + tmp0_r = q7_r_in - p0_r_in; + tmp0_r += q0_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + q7_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q7); + tmp0_l = q7_l_in - p0_l_in; + tmp0_l += q0_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += pitch; + + /* q1 */ + filter8 = LD_UB(filter48 + 64); + tmp0_r = q7_r_in - q0_r_in; + tmp0_r += q1_r_in; + tmp0_r -= p6_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + tmp0_l = q7_l_in - q0_l_in; + tmp0_l += q1_l_in; + tmp0_l -= p6_l_in; + tmp1_l 
+= tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += pitch; + + /* q2 */ + filter8 = LD_UB(filter48 + 80); + tmp0_r = q7_r_in - q1_r_in; + tmp0_r += q2_r_in; + tmp0_r -= p5_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + tmp0_l = q7_l_in - q1_l_in; + tmp0_l += q2_l_in; + tmp0_l -= p5_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += pitch; + + /* q3 */ + tmp0_r = q7_r_in - q2_r_in; + tmp0_r += q3_r_in; + tmp0_r -= p4_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + tmp0_l = q7_l_in - q2_l_in; + tmp0_l += q3_l_in; + tmp0_l -= p4_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2); + ST_UB(q3, src); + src += pitch; + + /* q4 */ + tmp0_r = q7_r_in - q3_r_in; + tmp0_r += q4_r_in; + tmp0_r -= p3_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + tmp0_l = q7_l_in - q3_l_in; + tmp0_l += q4_l_in; + tmp0_l -= p3_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2); + ST_UB(q4, src); + src += pitch; + + /* q5 */ + tmp0_r = q7_r_in - q4_r_in; + tmp0_r += q5_r_in; + tmp0_r -= p2_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + tmp0_l = q7_l_in - q4_l_in; + tmp0_l += q5_l_in; + tmp0_l -= p2_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2); + ST_UB(q5, src); + src += pitch; + + /* q6 */ + tmp0_r = q7_r_in - q5_r_in; + tmp0_r += q6_r_in; + tmp0_r -= p1_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + tmp0_l = q7_l_in - q5_l_in; + tmp0_l += q6_l_in; + tmp0_l -= p1_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2); + ST_UB(q6, src); + } +} + +void vp9_lpf_horizontal_16_dual_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr, + int32_t count) { + DECLARE_ALIGNED(32, uint8_t, filter48[16 * 8]); + uint8_t early_exit = 0; + + (void)count; + + early_exit = vp9_hz_lpf_t4_and_t8_16w(src, pitch, &filter48[0], b_limit_ptr, + limit_ptr, thresh_ptr); + + if (0 == early_exit) { + vp9_hz_lpf_t16_16w(src, pitch, filter48); + } +} + +void vp9_lpf_horizontal_16_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr, + int32_t count) { + if (1 == count) { + uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d; + uint64_t dword0, dword1; + v16u8 flat2, mask, hev, flat, thresh, b_limit, limit; + v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p7, p6, p5, p4, q4, q5, q6, q7; + v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + v16u8 p0_filter16, p1_filter16; + v8i16 p2_filter8, p1_filter8, p0_filter8; + v8i16 q0_filter8, q1_filter8, q2_filter8; + v8u16 p7_r, p6_r, p5_r, p4_r, q7_r, q6_r, q5_r, q4_r; + v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r; + v16i8 zero = { 0 }; + v8u16 tmp0, 
tmp1, tmp2; + + /* load vector elements */ + LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); + + thresh = (v16u8)__msa_fill_b(*thresh_ptr); + b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); + limit = (v16u8)__msa_fill_b(*limit_ptr); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, + hev, mask, flat); + VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); + VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, + q1_out); + + flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat); + + if (__msa_test_bz_v(flat)) { + p1_d = __msa_copy_u_d((v2i64)p1_out, 0); + p0_d = __msa_copy_u_d((v2i64)p0_out, 0); + q0_d = __msa_copy_u_d((v2i64)q0_out, 0); + q1_d = __msa_copy_u_d((v2i64)q1_out, 0); + SD4(p1_d, p0_d, q0_d, q1_d, src - 2 * pitch, pitch); + } else { + /* convert 8 bit input data into 16 bit */ + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, + zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, + q3_r); + VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8, + p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8); + + /* convert 16 bit output data into 8 bit */ + PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8, + zero, q0_filter8, p2_filter8, p1_filter8, p0_filter8, + q0_filter8); + PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8); + + /* store pixel values */ + p2_out = __msa_bmnz_v(p2, (v16u8)p2_filter8, flat); + p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filter8, flat); + p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filter8, flat); + q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filter8, flat); + q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filter8, flat); + q2_out = __msa_bmnz_v(q2, (v16u8)q2_filter8, flat); + + /* load 16 vector elements */ + LD_UB4((src - 8 * pitch), pitch, p7, p6, p5, p4); + LD_UB4(src + (4 * pitch), pitch, q4, q5, q6, q7); + + VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2); + + if (__msa_test_bz_v(flat2)) { + p2_d = __msa_copy_u_d((v2i64)p2_out, 0); + p1_d = __msa_copy_u_d((v2i64)p1_out, 0); + p0_d = __msa_copy_u_d((v2i64)p0_out, 0); + q0_d = __msa_copy_u_d((v2i64)q0_out, 0); + q1_d = __msa_copy_u_d((v2i64)q1_out, 0); + q2_d = __msa_copy_u_d((v2i64)q2_out, 0); + + SD4(p2_d, p1_d, p0_d, q0_d, src - 3 * pitch, pitch); + SD(q1_d, src + pitch); + SD(q2_d, src + 2 * pitch); + } else { + /* LSB(right) 8 pixel operation */ + ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, q4, zero, q5, + zero, q6, zero, q7, p7_r, p6_r, p5_r, p4_r, q4_r, q5_r, q6_r, + q7_r); + + tmp0 = p7_r << 3; + tmp0 -= p7_r; + tmp0 += p6_r; + tmp0 += q0_r; + + src -= 7 * pitch; + + /* calculation of p6 and p5 */ + tmp1 = p6_r + p5_r + p4_r + p3_r; + tmp1 += (p2_r + p1_r + p0_r); + tmp1 += tmp0; + p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + tmp0 = p5_r - p6_r + q1_r - p7_r; + tmp1 += tmp0; + p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16, + p1_filter16); + p0_filter16 = __msa_bmnz_v(p6, p0_filter16, flat2); + p1_filter16 = __msa_bmnz_v(p5, p1_filter16, flat2); + dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0); + dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0); + SD(dword0, src); + src += pitch; + SD(dword1, src); + src += pitch; + + /* calculation of p4 and p3 */ + tmp0 = p4_r - p5_r + q2_r - p7_r; + tmp2 = p3_r - p4_r + q3_r - p7_r; + tmp1 += tmp0; + p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + tmp1 += tmp2; + p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + PCKEV_B2_UB(zero, 
p0_filter16, zero, p1_filter16, p0_filter16, + p1_filter16); + p0_filter16 = __msa_bmnz_v(p4, p0_filter16, flat2); + p1_filter16 = __msa_bmnz_v(p3, p1_filter16, flat2); + dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0); + dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0); + SD(dword0, src); + src += pitch; + SD(dword1, src); + src += pitch; + + /* calculation of p2 and p1 */ + tmp0 = p2_r - p3_r + q4_r - p7_r; + tmp2 = p1_r - p2_r + q5_r - p7_r; + tmp1 += tmp0; + p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + tmp1 += tmp2; + p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16, + p1_filter16); + p0_filter16 = __msa_bmnz_v(p2_out, p0_filter16, flat2); + p1_filter16 = __msa_bmnz_v(p1_out, p1_filter16, flat2); + dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0); + dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0); + SD(dword0, src); + src += pitch; + SD(dword1, src); + src += pitch; + + /* calculation of p0 and q0 */ + tmp0 = (p0_r - p1_r) + (q6_r - p7_r); + tmp2 = (q7_r - p0_r) + (q0_r - p7_r); + tmp1 += tmp0; + p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + tmp1 += tmp2; + p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16, + p1_filter16); + p0_filter16 = __msa_bmnz_v(p0_out, p0_filter16, flat2); + p1_filter16 = __msa_bmnz_v(q0_out, p1_filter16, flat2); + dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0); + dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0); + SD(dword0, src); + src += pitch; + SD(dword1, src); + src += pitch; + + /* calculation of q1 and q2 */ + tmp0 = q7_r - q0_r + q1_r - p6_r; + tmp2 = q7_r - q1_r + q2_r - p5_r; + tmp1 += tmp0; + p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + tmp1 += tmp2; + p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16, + p1_filter16); + p0_filter16 = __msa_bmnz_v(q1_out, p0_filter16, flat2); + p1_filter16 = __msa_bmnz_v(q2_out, p1_filter16, flat2); + dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0); + dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0); + SD(dword0, src); + src += pitch; + SD(dword1, src); + src += pitch; + + /* calculation of q3 and q4 */ + tmp0 = (q7_r - q2_r) + (q3_r - p4_r); + tmp2 = (q7_r - q3_r) + (q4_r - p3_r); + tmp1 += tmp0; + p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + tmp1 += tmp2; + p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16, + p1_filter16); + p0_filter16 = __msa_bmnz_v(q3, p0_filter16, flat2); + p1_filter16 = __msa_bmnz_v(q4, p1_filter16, flat2); + dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0); + dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0); + SD(dword0, src); + src += pitch; + SD(dword1, src); + src += pitch; + + /* calculation of q5 and q6 */ + tmp0 = (q7_r - q4_r) + (q5_r - p2_r); + tmp2 = (q7_r - q5_r) + (q6_r - p1_r); + tmp1 += tmp0; + p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + tmp1 += tmp2; + p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16, + p1_filter16); + p0_filter16 = __msa_bmnz_v(q5, p0_filter16, flat2); + p1_filter16 = __msa_bmnz_v(q6, p1_filter16, flat2); + dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0); + dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0); + SD(dword0, src); + src += pitch; + SD(dword1, src); + } + } + } else { + vp9_lpf_horizontal_16_dual_msa(src, pitch, b_limit_ptr, limit_ptr, + thresh_ptr, count); + } +} + +static void 
vp9_transpose_16x8_to_8x16(uint8_t *input, int32_t in_pitch, + uint8_t *output, int32_t out_pitch) { + v16u8 p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org; + v16i8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + + LD_UB8(input, in_pitch, + p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org); + /* 8x8 transpose */ + TRANSPOSE8x8_UB_UB(p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, + p0_org, p7, p6, p5, p4, p3, p2, p1, p0); + /* 8x8 transpose */ + ILVL_B4_SB(p5_org, p7_org, p4_org, p6_org, p1_org, p3_org, p0_org, p2_org, + tmp0, tmp1, tmp2, tmp3); + ILVR_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp4, tmp6); + ILVL_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp7); + ILVR_W2_UB(tmp6, tmp4, tmp7, tmp5, q0, q4); + ILVL_W2_UB(tmp6, tmp4, tmp7, tmp5, q2, q6); + SLDI_B4_0_UB(q0, q2, q4, q6, q1, q3, q5, q7, 8); + + ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch); + output += (8 * out_pitch); + ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch); +} + +static void vp9_transpose_8x16_to_16x8(uint8_t *input, int32_t in_pitch, + uint8_t *output, int32_t out_pitch) { + v16u8 p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o; + v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + + LD_UB8(input, in_pitch, p7, p6, p5, p4, p3, p2, p1, p0); + LD_UB8(input + (8 * in_pitch), in_pitch, q0, q1, q2, q3, q4, q5, q6, q7); + TRANSPOSE16x8_UB_UB(p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, + q6, q7, p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o); + ST_UB8(p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o, output, out_pitch); +} + +static void vp9_transpose_16x16(uint8_t *input, int32_t in_pitch, + uint8_t *output, int32_t out_pitch) { + v16u8 row0, row1, row2, row3, row4, row5, row6, row7; + v16u8 row8, row9, row10, row11, row12, row13, row14, row15; + v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + v8i16 tmp0, tmp1, tmp4, tmp5, tmp6, tmp7; + v4i32 tmp2, tmp3; + + LD_UB8(input, in_pitch, row0, row1, row2, row3, row4, row5, row6, row7); + input += (8 * in_pitch); + LD_UB8(input, in_pitch, + row8, row9, row10, row11, row12, row13, row14, row15); + + TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, + row8, row9, row10, row11, row12, row13, row14, row15, + p7, p6, p5, p4, p3, p2, p1, p0); + + /* transpose 16x8 matrix into 8x16 */ + /* total 8 intermediate register and 32 instructions */ + q7 = (v16u8)__msa_ilvod_d((v2i64)row8, (v2i64)row0); + q6 = (v16u8)__msa_ilvod_d((v2i64)row9, (v2i64)row1); + q5 = (v16u8)__msa_ilvod_d((v2i64)row10, (v2i64)row2); + q4 = (v16u8)__msa_ilvod_d((v2i64)row11, (v2i64)row3); + q3 = (v16u8)__msa_ilvod_d((v2i64)row12, (v2i64)row4); + q2 = (v16u8)__msa_ilvod_d((v2i64)row13, (v2i64)row5); + q1 = (v16u8)__msa_ilvod_d((v2i64)row14, (v2i64)row6); + q0 = (v16u8)__msa_ilvod_d((v2i64)row15, (v2i64)row7); + + ILVEV_B2_SH(q7, q6, q5, q4, tmp0, tmp1); + tmp4 = (v8i16)__msa_ilvod_b((v16i8)q6, (v16i8)q7); + tmp5 = (v8i16)__msa_ilvod_b((v16i8)q4, (v16i8)q5); + + ILVEV_B2_UB(q3, q2, q1, q0, q5, q7); + tmp6 = (v8i16)__msa_ilvod_b((v16i8)q2, (v16i8)q3); + tmp7 = (v8i16)__msa_ilvod_b((v16i8)q0, (v16i8)q1); + + ILVEV_H2_SW(tmp0, tmp1, q5, q7, tmp2, tmp3); + q0 = (v16u8)__msa_ilvev_w(tmp3, tmp2); + q4 = (v16u8)__msa_ilvod_w(tmp3, tmp2); + + tmp2 = (v4i32)__msa_ilvod_h(tmp1, tmp0); + tmp3 = (v4i32)__msa_ilvod_h((v8i16)q7, (v8i16)q5); + q2 = (v16u8)__msa_ilvev_w(tmp3, tmp2); + q6 = (v16u8)__msa_ilvod_w(tmp3, tmp2); + + ILVEV_H2_SW(tmp4, 
tmp5, tmp6, tmp7, tmp2, tmp3); + q1 = (v16u8)__msa_ilvev_w(tmp3, tmp2); + q5 = (v16u8)__msa_ilvod_w(tmp3, tmp2); + + tmp2 = (v4i32)__msa_ilvod_h(tmp5, tmp4); + tmp3 = (v4i32)__msa_ilvod_h(tmp7, tmp6); + q3 = (v16u8)__msa_ilvev_w(tmp3, tmp2); + q7 = (v16u8)__msa_ilvod_w(tmp3, tmp2); + + ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch); + output += (8 * out_pitch); + ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch); +} + +int32_t vp9_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48, + uint8_t *src_org, int32_t pitch_org, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + v16u8 flat, mask, hev, thresh, b_limit, limit; + v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r; + v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r; + v16i8 zero = { 0 }; + v8i16 vec0, vec1, vec2, vec3; + + /* load vector elements */ + LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3); + + thresh = (v16u8)__msa_fill_b(*thresh_ptr); + b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); + limit = (v16u8)__msa_fill_b(*limit_ptr); + + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, + hev, mask, flat); + /* flat4 */ + VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); + /* filter4 */ + VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat); + + if (__msa_test_bz_v(flat)) { + ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec2, vec3); + ST4x8_UB(vec2, vec3, (src_org - 2), pitch_org); + return 1; + } else { + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, + zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, + q3_r); + VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, + p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); + + /* convert 16 bit output data into 8 bit */ + p2_r = (v8u16)__msa_pckev_b((v16i8)p2_filt8_r, (v16i8)p2_filt8_r); + p1_r = (v8u16)__msa_pckev_b((v16i8)p1_filt8_r, (v16i8)p1_filt8_r); + p0_r = (v8u16)__msa_pckev_b((v16i8)p0_filt8_r, (v16i8)p0_filt8_r); + q0_r = (v8u16)__msa_pckev_b((v16i8)q0_filt8_r, (v16i8)q0_filt8_r); + q1_r = (v8u16)__msa_pckev_b((v16i8)q1_filt8_r, (v16i8)q1_filt8_r); + q2_r = (v8u16)__msa_pckev_b((v16i8)q2_filt8_r, (v16i8)q2_filt8_r); + + /* store pixel values */ + p2_out = __msa_bmnz_v(p2, (v16u8)p2_r, flat); + p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_r, flat); + p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_r, flat); + q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_r, flat); + q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_r, flat); + q2_out = __msa_bmnz_v(q2, (v16u8)q2_r, flat); + + ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16); + filter48 += (4 * 16); + ST_UB2(q1_out, q2_out, filter48, 16); + filter48 += (2 * 16); + ST_UB(flat, filter48); + + return 0; + } +} + +int32_t vp9_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, int32_t pitch, + uint8_t *filter48) { + v16i8 zero = { 0 }; + v16u8 filter8, flat, flat2; + v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in; + v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in; + v8u16 tmp0_r, tmp1_r; + v8i16 r_out; + + flat = LD_UB(filter48 + 6 * 16); + + LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0); + LD_UB8(src, 16, q0, q1, q2, 
q3, q4, q5, q6, q7); + + VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2); + + if (__msa_test_bz_v(flat2)) { + v8i16 vec0, vec1, vec2, vec3, vec4; + + LD_UB4(filter48, 16, p2, p1, p0, q0); + LD_UB2(filter48 + 4 * 16, 16, q1, q2); + + ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec3, vec4); + vec2 = (v8i16)__msa_ilvr_b((v16i8)q2, (v16i8)q1); + + src_org -= 3; + ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch); + ST2x4_UB(vec2, 0, (src_org + 4), pitch); + src_org += (4 * pitch); + ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch); + ST2x4_UB(vec2, 4, (src_org + 4), pitch); + + return 1; + } else { + src -= 7 * 16; + + ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, + zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, + p3_r_in, p2_r_in, p1_r_in, p0_r_in); + q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0); + + tmp0_r = p7_r_in << 3; + tmp0_r -= p7_r_in; + tmp0_r += p6_r_in; + tmp0_r += q0_r_in; + tmp1_r = p6_r_in + p5_r_in; + tmp1_r += p4_r_in; + tmp1_r += p3_r_in; + tmp1_r += p2_r_in; + tmp1_r += p1_r_in; + tmp1_r += p0_r_in; + tmp1_r += tmp0_r; + + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2); + ST8x1_UB(p6, src); + src += 16; + + /* p5 */ + q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1); + tmp0_r = p5_r_in - p6_r_in; + tmp0_r += q1_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2); + ST8x1_UB(p5, src); + src += 16; + + /* p4 */ + q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2); + tmp0_r = p4_r_in - p5_r_in; + tmp0_r += q2_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2); + ST8x1_UB(p4, src); + src += 16; + + /* p3 */ + q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3); + tmp0_r = p3_r_in - p4_r_in; + tmp0_r += q3_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2); + ST8x1_UB(p3, src); + src += 16; + + /* p2 */ + q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4); + filter8 = LD_UB(filter48); + tmp0_r = p2_r_in - p3_r_in; + tmp0_r += q4_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST8x1_UB(filter8, src); + src += 16; + + /* p1 */ + q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5); + filter8 = LD_UB(filter48 + 16); + tmp0_r = p1_r_in - p2_r_in; + tmp0_r += q5_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST8x1_UB(filter8, src); + src += 16; + + /* p0 */ + q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6); + filter8 = LD_UB(filter48 + 32); + tmp0_r = p0_r_in - p1_r_in; + tmp0_r += q6_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST8x1_UB(filter8, src); + src += 16; + + /* q0 */ + q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7); + filter8 = 
LD_UB(filter48 + 48); + tmp0_r = q7_r_in - p0_r_in; + tmp0_r += q0_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST8x1_UB(filter8, src); + src += 16; + + /* q1 */ + filter8 = LD_UB(filter48 + 64); + tmp0_r = q7_r_in - q0_r_in; + tmp0_r += q1_r_in; + tmp0_r -= p6_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST8x1_UB(filter8, src); + src += 16; + + /* q2 */ + filter8 = LD_UB(filter48 + 80); + tmp0_r = q7_r_in - q1_r_in; + tmp0_r += q2_r_in; + tmp0_r -= p5_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST8x1_UB(filter8, src); + src += 16; + + /* q3 */ + tmp0_r = q7_r_in - q2_r_in; + tmp0_r += q3_r_in; + tmp0_r -= p4_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2); + ST8x1_UB(q3, src); + src += 16; + + /* q4 */ + tmp0_r = q7_r_in - q3_r_in; + tmp0_r += q4_r_in; + tmp0_r -= p3_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2); + ST8x1_UB(q4, src); + src += 16; + + /* q5 */ + tmp0_r = q7_r_in - q4_r_in; + tmp0_r += q5_r_in; + tmp0_r -= p2_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2); + ST8x1_UB(q5, src); + src += 16; + + /* q6 */ + tmp0_r = q7_r_in - q5_r_in; + tmp0_r += q6_r_in; + tmp0_r -= p1_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2); + ST8x1_UB(q6, src); + + return 0; + } +} + +void vp9_lpf_vertical_16_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + uint8_t early_exit = 0; + DECLARE_ALIGNED(32, uint8_t, transposed_input[16 * 24]); + uint8_t *filter48 = &transposed_input[16 * 16]; + + vp9_transpose_16x8_to_8x16(src - 8, pitch, transposed_input, 16); + + early_exit = vp9_vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8), + &filter48[0], src, pitch, b_limit_ptr, + limit_ptr, thresh_ptr); + + if (0 == early_exit) { + early_exit = vp9_vt_lpf_t16_8w((transposed_input + 16 * 8), src, pitch, + &filter48[0]); + + if (0 == early_exit) { + vp9_transpose_8x16_to_16x8(transposed_input, 16, src - 8, pitch); + } + } +} + +int32_t vp9_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48, + uint8_t *src_org, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + v16u8 flat, mask, hev, thresh, b_limit, limit; + v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r; + v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; + v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r; + v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l; + v16i8 zero = { 0 }; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5; + + /* load 
vector elements */ + LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3); + + thresh = (v16u8)__msa_fill_b(*thresh_ptr); + b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); + limit = (v16u8)__msa_fill_b(*limit_ptr); + + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, + hev, mask, flat); + /* flat4 */ + VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); + /* filter4 */ + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + if (__msa_test_bz_v(flat)) { + ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec2, vec3); + ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec4, vec5); + + src_org -= 2; + ST4x8_UB(vec2, vec3, src_org, pitch); + src_org += 8 * pitch; + ST4x8_UB(vec4, vec5, src_org, pitch); + + return 1; + } else { + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, + zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, + q3_r); + VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, + p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); + ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l); + ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l); + VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); + + /* convert 16 bit output data into 8 bit */ + PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l, + p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r, + p0_filt8_r, q0_filt8_r); + PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r, + q2_filt8_r); + + /* store pixel values */ + p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat); + p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat); + p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat); + q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat); + q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat); + q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat); + + ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16); + filter48 += (4 * 16); + ST_UB2(q1_out, q2_out, filter48, 16); + filter48 += (2 * 16); + ST_UB(flat, filter48); + + return 0; + } +} + +int32_t vp9_vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, int32_t pitch, + uint8_t *filter48) { + v16u8 flat, flat2, filter8; + v16i8 zero = { 0 }; + v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in; + v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in; + v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in, p3_l_in, p2_l_in, p1_l_in, p0_l_in; + v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in, q3_l_in, q2_l_in, q1_l_in, q0_l_in; + v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l; + v8i16 l_out, r_out; + + flat = LD_UB(filter48 + 6 * 16); + + LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0); + LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7); + + VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2); + + if (__msa_test_bz_v(flat2)) { + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + + LD_UB4(filter48, 16, p2, p1, p0, q0); + LD_UB2(filter48 + 4 * 16, 16, q1, q2); + + ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec3, vec4); + ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec6, vec7); + ILVRL_B2_SH(q2, q1, vec2, vec5); + + src_org -= 3; + ST4x4_UB(vec3, vec3, 0, 1, 2, 3, 
src_org, pitch); + ST2x4_UB(vec2, 0, (src_org + 4), pitch); + src_org += (4 * pitch); + ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch); + ST2x4_UB(vec2, 4, (src_org + 4), pitch); + src_org += (4 * pitch); + ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src_org, pitch); + ST2x4_UB(vec5, 0, (src_org + 4), pitch); + src_org += (4 * pitch); + ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src_org, pitch); + ST2x4_UB(vec5, 4, (src_org + 4), pitch); + + return 1; + } else { + src -= 7 * 16; + + ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, + zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, + p3_r_in, p2_r_in, p1_r_in, p0_r_in); + q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0); + + tmp0_r = p7_r_in << 3; + tmp0_r -= p7_r_in; + tmp0_r += p6_r_in; + tmp0_r += q0_r_in; + tmp1_r = p6_r_in + p5_r_in; + tmp1_r += p4_r_in; + tmp1_r += p3_r_in; + tmp1_r += p2_r_in; + tmp1_r += p1_r_in; + tmp1_r += p0_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in, + p5_l_in, p4_l_in); + ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in, + p1_l_in, p0_l_in); + q0_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q0); + + tmp0_l = p7_l_in << 3; + tmp0_l -= p7_l_in; + tmp0_l += p6_l_in; + tmp0_l += q0_l_in; + tmp1_l = p6_l_in + p5_l_in; + tmp1_l += p4_l_in; + tmp1_l += p3_l_in; + tmp1_l += p2_l_in; + tmp1_l += p1_l_in; + tmp1_l += p0_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2); + ST_UB(p6, src); + src += 16; + + /* p5 */ + q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1); + tmp0_r = p5_r_in - p6_r_in; + tmp0_r += q1_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + q1_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q1); + tmp0_l = p5_l_in - p6_l_in; + tmp0_l += q1_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2); + ST_UB(p5, src); + src += 16; + + /* p4 */ + q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2); + tmp0_r = p4_r_in - p5_r_in; + tmp0_r += q2_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + q2_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q2); + tmp0_l = p4_l_in - p5_l_in; + tmp0_l += q2_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2); + ST_UB(p4, src); + src += 16; + + /* p3 */ + q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3); + tmp0_r = p3_r_in - p4_r_in; + tmp0_r += q3_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + q3_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q3); + tmp0_l = p3_l_in - p4_l_in; + tmp0_l += q3_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2); + ST_UB(p3, src); + src += 16; + + /* p2 */ + q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4); + filter8 = LD_UB(filter48); + tmp0_r = p2_r_in - p3_r_in; + tmp0_r += q4_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + q4_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q4); + tmp0_l = p2_l_in - p3_l_in; + tmp0_l += q4_l_in; + tmp0_l -= p7_l_in; + 
tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += 16; + + /* p1 */ + q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5); + filter8 = LD_UB(filter48 + 16); + tmp0_r = p1_r_in - p2_r_in; + tmp0_r += q5_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + q5_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q5); + tmp0_l = p1_l_in - p2_l_in; + tmp0_l += q5_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)(tmp1_l), 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += 16; + + /* p0 */ + q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6); + filter8 = LD_UB(filter48 + 32); + tmp0_r = p0_r_in - p1_r_in; + tmp0_r += q6_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + q6_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q6); + tmp0_l = p0_l_in - p1_l_in; + tmp0_l += q6_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += 16; + + /* q0 */ + q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7); + filter8 = LD_UB(filter48 + 48); + tmp0_r = q7_r_in - p0_r_in; + tmp0_r += q0_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + q7_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q7); + tmp0_l = q7_l_in - p0_l_in; + tmp0_l += q0_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += 16; + + /* q1 */ + filter8 = LD_UB(filter48 + 64); + tmp0_r = q7_r_in - q0_r_in; + tmp0_r += q1_r_in; + tmp0_r -= p6_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + tmp0_l = q7_l_in - q0_l_in; + tmp0_l += q1_l_in; + tmp0_l -= p6_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += 16; + + /* q2 */ + filter8 = LD_UB(filter48 + 80); + tmp0_r = q7_r_in - q1_r_in; + tmp0_r += q2_r_in; + tmp0_r -= p5_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + tmp0_l = q7_l_in - q1_l_in; + tmp0_l += q2_l_in; + tmp0_l -= p5_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += 16; + + /* q3 */ + tmp0_r = q7_r_in - q2_r_in; + tmp0_r += q3_r_in; + tmp0_r -= p4_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + tmp0_l = q7_l_in - q2_l_in; + tmp0_l += q3_l_in; + tmp0_l -= p4_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2); + ST_UB(q3, src); + src += 16; + + /* q4 */ + tmp0_r = q7_r_in - q3_r_in; + tmp0_r += q4_r_in; + tmp0_r -= p3_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + tmp0_l = q7_l_in - q3_l_in; + tmp0_l += q4_l_in; + tmp0_l -= p3_l_in; + tmp1_l += tmp0_l; + l_out = 
__msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2); + ST_UB(q4, src); + src += 16; + + /* q5 */ + tmp0_r = q7_r_in - q4_r_in; + tmp0_r += q5_r_in; + tmp0_r -= p2_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + tmp0_l = q7_l_in - q4_l_in; + tmp0_l += q5_l_in; + tmp0_l -= p2_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2); + ST_UB(q5, src); + src += 16; + + /* q6 */ + tmp0_r = q7_r_in - q5_r_in; + tmp0_r += q6_r_in; + tmp0_r -= p1_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + tmp0_l = q7_l_in - q5_l_in; + tmp0_l += q6_l_in; + tmp0_l -= p1_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2); + ST_UB(q6, src); + + return 0; + } +} + +void vp9_lpf_vertical_16_dual_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + uint8_t early_exit = 0; + DECLARE_ALIGNED(32, uint8_t, transposed_input[16 * 24]); + uint8_t *filter48 = &transposed_input[16 * 16]; + + vp9_transpose_16x16((src - 8), pitch, &transposed_input[0], 16); + + early_exit = vp9_vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8), + &filter48[0], src, pitch, b_limit_ptr, + limit_ptr, thresh_ptr); + + if (0 == early_exit) { + early_exit = vp9_vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch, + &filter48[0]); + + if (0 == early_exit) { + vp9_transpose_16x16(transposed_input, 16, (src - 8), pitch); + } + } +} diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_loopfilter_4_msa.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_loopfilter_4_msa.c new file mode 100644 index 00000000000..7f691355a0f --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_loopfilter_4_msa.c @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "vp9/common/mips/msa/vp9_loopfilter_msa.h" + +void vp9_lpf_horizontal_4_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr, + int32_t count) { + uint64_t p1_d, p0_d, q0_d, q1_d; + v16u8 mask, hev, flat, thresh, b_limit, limit; + v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p1_out, p0_out, q0_out, q1_out; + + (void)count; + + /* load vector elements */ + LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); + + thresh = (v16u8)__msa_fill_b(*thresh_ptr); + b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); + limit = (v16u8)__msa_fill_b(*limit_ptr); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, + hev, mask, flat); + VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + p1_d = __msa_copy_u_d((v2i64)p1_out, 0); + p0_d = __msa_copy_u_d((v2i64)p0_out, 0); + q0_d = __msa_copy_u_d((v2i64)q0_out, 0); + q1_d = __msa_copy_u_d((v2i64)q1_out, 0); + SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch); +} + +void vp9_lpf_horizontal_4_dual_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit0_ptr, + const uint8_t *limit0_ptr, + const uint8_t *thresh0_ptr, + const uint8_t *b_limit1_ptr, + const uint8_t *limit1_ptr, + const uint8_t *thresh1_ptr) { + v16u8 mask, hev, flat, thresh0, b_limit0, limit0, thresh1, b_limit1, limit1; + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + + /* load vector elements */ + LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); + + thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr); + thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr); + thresh0 = (v16u8)__msa_ilvr_d((v2i64)thresh1, (v2i64)thresh0); + + b_limit0 = (v16u8)__msa_fill_b(*b_limit0_ptr); + b_limit1 = (v16u8)__msa_fill_b(*b_limit1_ptr); + b_limit0 = (v16u8)__msa_ilvr_d((v2i64)b_limit1, (v2i64)b_limit0); + + limit0 = (v16u8)__msa_fill_b(*limit0_ptr); + limit1 = (v16u8)__msa_fill_b(*limit1_ptr); + limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, + hev, mask, flat); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); + + ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch); +} + +void vp9_lpf_vertical_4_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr, + int32_t count) { + v16u8 mask, hev, flat, limit, thresh, b_limit; + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v8i16 vec0, vec1, vec2, vec3; + + (void)count; + + LD_UB8((src - 4), pitch, p3, p2, p1, p0, q0, q1, q2, q3); + + thresh = (v16u8)__msa_fill_b(*thresh_ptr); + b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); + limit = (v16u8)__msa_fill_b(*limit_ptr); + + TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3, + p3, p2, p1, p0, q0, q1, q2, q3); + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, + hev, mask, flat); + VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); + ILVR_B2_SH(p0, p1, q1, q0, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec2, vec3); + + src -= 2; + ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch); + src += 4 * pitch; + ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch); +} + +void vp9_lpf_vertical_4_dual_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit0_ptr, + const uint8_t *limit0_ptr, + const uint8_t *thresh0_ptr, + const uint8_t *b_limit1_ptr, + const uint8_t *limit1_ptr, + const uint8_t *thresh1_ptr) { + v16u8 mask, hev, flat; + v16u8 thresh0, b_limit0, limit0, thresh1, b_limit1, limit1; + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + 
v16u8 row0, row1, row2, row3, row4, row5, row6, row7; + v16u8 row8, row9, row10, row11, row12, row13, row14, row15; + v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + + LD_UB8(src - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7); + LD_UB8(src - 4 + (8 * pitch), pitch, + row8, row9, row10, row11, row12, row13, row14, row15); + + TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, + row8, row9, row10, row11, row12, row13, row14, row15, + p3, p2, p1, p0, q0, q1, q2, q3); + + thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr); + thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr); + thresh0 = (v16u8)__msa_ilvr_d((v2i64)thresh1, (v2i64)thresh0); + + b_limit0 = (v16u8)__msa_fill_b(*b_limit0_ptr); + b_limit1 = (v16u8)__msa_fill_b(*b_limit1_ptr); + b_limit0 = (v16u8)__msa_ilvr_d((v2i64)b_limit1, (v2i64)b_limit0); + + limit0 = (v16u8)__msa_fill_b(*limit0_ptr); + limit1 = (v16u8)__msa_fill_b(*limit1_ptr); + limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, + hev, mask, flat); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); + ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1); + ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3); + ILVL_B2_SH(p0, p1, q1, q0, tmp0, tmp1); + ILVRL_H2_SH(tmp1, tmp0, tmp4, tmp5); + + src -= 2; + + ST4x8_UB(tmp2, tmp3, src, pitch); + src += (8 * pitch); + ST4x8_UB(tmp4, tmp5, src, pitch); +} diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_loopfilter_8_msa.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_loopfilter_8_msa.c new file mode 100644 index 00000000000..26a858d6e62 --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_loopfilter_8_msa.c @@ -0,0 +1,348 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "vp9/common/mips/msa/vp9_loopfilter_msa.h" + +void vp9_lpf_horizontal_8_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr, + int32_t count) { + uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d; + v16u8 mask, hev, flat, thresh, b_limit, limit; + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + v8i16 p2_filter8, p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8; + v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r; + v16i8 zero = { 0 }; + + (void)count; + + /* load vector elements */ + LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); + + thresh = (v16u8)__msa_fill_b(*thresh_ptr); + b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); + limit = (v16u8)__msa_fill_b(*limit_ptr); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, + hev, mask, flat); + VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); + VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat); + + if (__msa_test_bz_v(flat)) { + p1_d = __msa_copy_u_d((v2i64)p1_out, 0); + p0_d = __msa_copy_u_d((v2i64)p0_out, 0); + q0_d = __msa_copy_u_d((v2i64)q0_out, 0); + q1_d = __msa_copy_u_d((v2i64)q1_out, 0); + SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch); + } else { + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, + zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, + q2_r, q3_r); + VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8, + p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8); + + /* convert 16 bit output data into 8 bit */ + PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8, + zero, q0_filter8, p2_filter8, p1_filter8, p0_filter8, + q0_filter8); + PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8); + + /* store pixel values */ + p2_out = __msa_bmnz_v(p2, (v16u8)p2_filter8, flat); + p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filter8, flat); + p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filter8, flat); + q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filter8, flat); + q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filter8, flat); + q2_out = __msa_bmnz_v(q2, (v16u8)q2_filter8, flat); + + p2_d = __msa_copy_u_d((v2i64)p2_out, 0); + p1_d = __msa_copy_u_d((v2i64)p1_out, 0); + p0_d = __msa_copy_u_d((v2i64)p0_out, 0); + q0_d = __msa_copy_u_d((v2i64)q0_out, 0); + q1_d = __msa_copy_u_d((v2i64)q1_out, 0); + q2_d = __msa_copy_u_d((v2i64)q2_out, 0); + + src -= 3 * pitch; + + SD4(p2_d, p1_d, p0_d, q0_d, src, pitch); + src += (4 * pitch); + SD(q1_d, src); + src += pitch; + SD(q2_d, src); + } +} + +void vp9_lpf_horizontal_8_dual_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *b_limit1, + const uint8_t *limit1, + const uint8_t *thresh1) { + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + v16u8 flat, mask, hev, tmp, thresh, b_limit, limit; + v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r; + v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; + v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r; + v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l; + v16u8 zero = { 0 }; + + /* load vector elements */ + LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); + + thresh = (v16u8)__msa_fill_b(*thresh0); + tmp = (v16u8)__msa_fill_b(*thresh1); + 
thresh = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)thresh); + + b_limit = (v16u8)__msa_fill_b(*b_limit0); + tmp = (v16u8)__msa_fill_b(*b_limit1); + b_limit = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)b_limit); + + limit = (v16u8)__msa_fill_b(*limit0); + tmp = (v16u8)__msa_fill_b(*limit1); + limit = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)limit); + + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, + hev, mask, flat); + VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + if (__msa_test_bz_v(flat)) { + ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch); + } else { + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, + zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, + q2_r, q3_r); + VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, + p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); + + ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l); + ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l); + VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); + + /* convert 16 bit output data into 8 bit */ + PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l, + p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r, + p0_filt8_r, q0_filt8_r); + PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r, + q2_filt8_r); + + /* store pixel values */ + p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat); + p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat); + p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat); + q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat); + q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat); + q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat); + + src -= 3 * pitch; + + ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch); + src += (4 * pitch); + ST_UB2(q1_out, q2_out, src, pitch); + src += (2 * pitch); + } +} + +void vp9_lpf_vertical_8_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr, + int32_t count) { + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 p1_out, p0_out, q0_out, q1_out; + v16u8 flat, mask, hev, thresh, b_limit, limit; + v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r; + v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r; + v16u8 zero = { 0 }; + v8i16 vec0, vec1, vec2, vec3, vec4; + + (void)count; + + /* load vector elements */ + LD_UB8(src - 4, pitch, p3, p2, p1, p0, q0, q1, q2, q3); + + TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3, + p3, p2, p1, p0, q0, q1, q2, q3); + + thresh = (v16u8)__msa_fill_b(*thresh_ptr); + b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); + limit = (v16u8)__msa_fill_b(*limit_ptr); + + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, + hev, mask, flat); + /* flat4 */ + VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); + /* filter4 */ + VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat); + + if (__msa_test_bz_v(flat)) { + /* Store 4 pixels p1-_q1 */ + ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec2, vec3); + + src -= 2; + ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch); + src += 4 * pitch; + ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch); + } else 
{ + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, + zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, + q3_r); + VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, + p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); + /* convert 16 bit output data into 8 bit */ + PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r, p0_filt8_r, + p0_filt8_r, q0_filt8_r, q0_filt8_r, p2_filt8_r, p1_filt8_r, + p0_filt8_r, q0_filt8_r); + PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r, q1_filt8_r, + q2_filt8_r); + + /* store pixel values */ + p2 = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat); + p1 = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat); + p0 = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat); + q0 = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat); + q1 = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat); + q2 = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat); + + /* Store 6 pixels p2-_q2 */ + ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec2, vec3); + vec4 = (v8i16)__msa_ilvr_b((v16i8)q2, (v16i8)q1); + + src -= 3; + ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch); + ST2x4_UB(vec4, 0, src + 4, pitch); + src += (4 * pitch); + ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch); + ST2x4_UB(vec4, 4, src + 4, pitch); + } +} + +void vp9_lpf_vertical_8_dual_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *b_limit1, + const uint8_t *limit1, + const uint8_t *thresh1) { + uint8_t *temp_src; + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 p1_out, p0_out, q0_out, q1_out; + v16u8 flat, mask, hev, thresh, b_limit, limit; + v16u8 row4, row5, row6, row7, row12, row13, row14, row15; + v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r; + v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; + v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r; + v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l; + v16u8 zero = { 0 }; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + + temp_src = src - 4; + + LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7); + temp_src += (8 * pitch); + LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15); + + /* transpose 16x8 matrix into 8x16 */ + TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7, + q3, q2, q1, q0, row12, row13, row14, row15, + p3, p2, p1, p0, q0, q1, q2, q3); + + thresh = (v16u8)__msa_fill_b(*thresh0); + vec0 = (v8i16)__msa_fill_b(*thresh1); + thresh = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)thresh); + + b_limit = (v16u8)__msa_fill_b(*b_limit0); + vec0 = (v8i16)__msa_fill_b(*b_limit1); + b_limit = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)b_limit); + + limit = (v16u8)__msa_fill_b(*limit0); + vec0 = (v8i16)__msa_fill_b(*limit1); + limit = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)limit); + + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, + hev, mask, flat); + /* flat4 */ + VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); + /* filter4 */ + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + if (__msa_test_bz_v(flat)) { + ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec2, vec3); + ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec4, vec5); + + src -= 2; + ST4x8_UB(vec2, vec3, src, pitch); + src += 8 * pitch; + ST4x8_UB(vec4, vec5, src, pitch); + } else { + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, 
zero, p0, zero, q0, zero, q1, + zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, + q3_r); + VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, + p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); + + ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l); + ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l); + + /* filter8 */ + VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); + + /* convert 16 bit output data into 8 bit */ + PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l, + p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r, + p0_filt8_r, q0_filt8_r); + PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r, + q2_filt8_r); + + /* store pixel values */ + p2 = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat); + p1 = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat); + p0 = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat); + q0 = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat); + q1 = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat); + q2 = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat); + + ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec3, vec4); + ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec6, vec7); + ILVRL_B2_SH(q2, q1, vec2, vec5); + + src -= 3; + ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch); + ST2x4_UB(vec2, 0, src + 4, pitch); + src += (4 * pitch); + ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src, pitch); + ST2x4_UB(vec2, 4, src + 4, pitch); + src += (4 * pitch); + ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src, pitch); + ST2x4_UB(vec5, 0, src + 4, pitch); + src += (4 * pitch); + ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src, pitch); + ST2x4_UB(vec5, 4, src + 4, pitch); + } +} diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_loopfilter_msa.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_loopfilter_msa.h new file mode 100644 index 00000000000..0643e41a520 --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_loopfilter_msa.h @@ -0,0 +1,246 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VP9_COMMON_MIPS_MSA_VP9_LOOPFILTER_MSA_H_ +#define VP9_COMMON_MIPS_MSA_VP9_LOOPFILTER_MSA_H_ + +#include "vp9/common/mips/msa/vp9_macros_msa.h" + +#define VP9_LPF_FILTER4_8W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \ + p1_out, p0_out, q0_out, q1_out) { \ + v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign; \ + v16i8 filt, filt1, filt2, cnst4b, cnst3b; \ + v8i16 q0_sub_p0_r, filt_r, cnst3h; \ + \ + p1_m = (v16i8)__msa_xori_b(p1_in, 0x80); \ + p0_m = (v16i8)__msa_xori_b(p0_in, 0x80); \ + q0_m = (v16i8)__msa_xori_b(q0_in, 0x80); \ + q1_m = (v16i8)__msa_xori_b(q1_in, 0x80); \ + \ + filt = __msa_subs_s_b(p1_m, q1_m); \ + filt = filt & (v16i8)hev_in; \ + q0_sub_p0 = q0_m - p0_m; \ + filt_sign = __msa_clti_s_b(filt, 0); \ + \ + cnst3h = __msa_ldi_h(3); \ + q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0, q0_sub_p0); \ + q0_sub_p0_r = __msa_dotp_s_h((v16i8)q0_sub_p0_r, (v16i8)cnst3h); \ + filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt); \ + filt_r += q0_sub_p0_r; \ + filt_r = __msa_sat_s_h(filt_r, 7); \ + \ + /* combine left and right part */ \ + filt = __msa_pckev_b((v16i8)filt_r, (v16i8)filt_r); \ + \ + filt = filt & (v16i8)mask_in; \ + cnst4b = __msa_ldi_b(4); \ + filt1 = __msa_adds_s_b(filt, cnst4b); \ + filt1 >>= 3; \ + \ + cnst3b = __msa_ldi_b(3); \ + filt2 = __msa_adds_s_b(filt, cnst3b); \ + filt2 >>= 3; \ + \ + q0_m = __msa_subs_s_b(q0_m, filt1); \ + q0_out = __msa_xori_b((v16u8)q0_m, 0x80); \ + p0_m = __msa_adds_s_b(p0_m, filt2); \ + p0_out = __msa_xori_b((v16u8)p0_m, 0x80); \ + \ + filt = __msa_srari_b(filt1, 1); \ + hev_in = __msa_xori_b((v16u8)hev_in, 0xff); \ + filt = filt & (v16i8)hev_in; \ + \ + q1_m = __msa_subs_s_b(q1_m, filt); \ + q1_out = __msa_xori_b((v16u8)q1_m, 0x80); \ + p1_m = __msa_adds_s_b(p1_m, filt); \ + p1_out = __msa_xori_b((v16u8)p1_m, 0x80); \ +} + +#define VP9_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \ + p1_out, p0_out, q0_out, q1_out) { \ + v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign; \ + v16i8 filt, filt1, filt2, cnst4b, cnst3b; \ + v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h; \ + \ + p1_m = (v16i8)__msa_xori_b(p1_in, 0x80); \ + p0_m = (v16i8)__msa_xori_b(p0_in, 0x80); \ + q0_m = (v16i8)__msa_xori_b(q0_in, 0x80); \ + q1_m = (v16i8)__msa_xori_b(q1_in, 0x80); \ + \ + filt = __msa_subs_s_b(p1_m, q1_m); \ + \ + filt = filt & (v16i8)hev_in; \ + \ + q0_sub_p0 = q0_m - p0_m; \ + filt_sign = __msa_clti_s_b(filt, 0); \ + \ + cnst3h = __msa_ldi_h(3); \ + q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0, q0_sub_p0); \ + q0_sub_p0_r = __msa_dotp_s_h((v16i8)q0_sub_p0_r, (v16i8)cnst3h); \ + filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt); \ + filt_r += q0_sub_p0_r; \ + filt_r = __msa_sat_s_h(filt_r, 7); \ + \ + q0_sub_p0_l = (v8i16)__msa_ilvl_b(q0_sub_p0, q0_sub_p0); \ + q0_sub_p0_l = __msa_dotp_s_h((v16i8)q0_sub_p0_l, (v16i8)cnst3h); \ + filt_l = (v8i16)__msa_ilvl_b(filt_sign, filt); \ + filt_l += q0_sub_p0_l; \ + filt_l = __msa_sat_s_h(filt_l, 7); \ + \ + filt = __msa_pckev_b((v16i8)filt_l, (v16i8)filt_r); \ + filt = filt & (v16i8)mask_in; \ + \ + cnst4b = __msa_ldi_b(4); \ + filt1 = __msa_adds_s_b(filt, cnst4b); \ + filt1 >>= 3; \ + \ + cnst3b = __msa_ldi_b(3); \ + filt2 = __msa_adds_s_b(filt, cnst3b); \ + filt2 >>= 3; \ + \ + q0_m = __msa_subs_s_b(q0_m, filt1); \ + q0_out = __msa_xori_b((v16u8)q0_m, 0x80); \ + p0_m = __msa_adds_s_b(p0_m, filt2); \ + p0_out = __msa_xori_b((v16u8)p0_m, 0x80); \ + \ + filt = __msa_srari_b(filt1, 1); \ + hev_in = __msa_xori_b((v16u8)hev_in, 0xff); \ + filt = filt & (v16i8)hev_in; \ + \ + q1_m = 
__msa_subs_s_b(q1_m, filt); \ + q1_out = __msa_xori_b((v16u8)q1_m, 0x80); \ + p1_m = __msa_adds_s_b(p1_m, filt); \ + p1_out = __msa_xori_b((v16u8)p1_m, 0x80); \ +} + +#define VP9_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out) { \ + v16u8 tmp, p2_a_sub_p0, q2_a_sub_q0, p3_a_sub_p0, q3_a_sub_q0; \ + v16u8 zero_in = { 0 }; \ + \ + tmp = __msa_ori_b(zero_in, 1); \ + p2_a_sub_p0 = __msa_asub_u_b(p2_in, p0_in); \ + q2_a_sub_q0 = __msa_asub_u_b(q2_in, q0_in); \ + p3_a_sub_p0 = __msa_asub_u_b(p3_in, p0_in); \ + q3_a_sub_q0 = __msa_asub_u_b(q3_in, q0_in); \ + \ + p2_a_sub_p0 = __msa_max_u_b(p2_a_sub_p0, q2_a_sub_q0); \ + flat_out = __msa_max_u_b(p2_a_sub_p0, flat_out); \ + p3_a_sub_p0 = __msa_max_u_b(p3_a_sub_p0, q3_a_sub_q0); \ + flat_out = __msa_max_u_b(p3_a_sub_p0, flat_out); \ + \ + flat_out = (tmp < (v16u8)flat_out); \ + flat_out = __msa_xori_b(flat_out, 0xff); \ + flat_out = flat_out & (mask); \ +} + +#define VP9_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in, \ + q5_in, q6_in, q7_in, flat_in, flat2_out) { \ + v16u8 tmp, zero_in = { 0 }; \ + v16u8 p4_a_sub_p0, q4_a_sub_q0, p5_a_sub_p0, q5_a_sub_q0; \ + v16u8 p6_a_sub_p0, q6_a_sub_q0, p7_a_sub_p0, q7_a_sub_q0; \ + \ + tmp = __msa_ori_b(zero_in, 1); \ + p4_a_sub_p0 = __msa_asub_u_b(p4_in, p0_in); \ + q4_a_sub_q0 = __msa_asub_u_b(q4_in, q0_in); \ + p5_a_sub_p0 = __msa_asub_u_b(p5_in, p0_in); \ + q5_a_sub_q0 = __msa_asub_u_b(q5_in, q0_in); \ + p6_a_sub_p0 = __msa_asub_u_b(p6_in, p0_in); \ + q6_a_sub_q0 = __msa_asub_u_b(q6_in, q0_in); \ + p7_a_sub_p0 = __msa_asub_u_b(p7_in, p0_in); \ + q7_a_sub_q0 = __msa_asub_u_b(q7_in, q0_in); \ + \ + p4_a_sub_p0 = __msa_max_u_b(p4_a_sub_p0, q4_a_sub_q0); \ + flat2_out = __msa_max_u_b(p5_a_sub_p0, q5_a_sub_q0); \ + flat2_out = __msa_max_u_b(p4_a_sub_p0, flat2_out); \ + p6_a_sub_p0 = __msa_max_u_b(p6_a_sub_p0, q6_a_sub_q0); \ + flat2_out = __msa_max_u_b(p6_a_sub_p0, flat2_out); \ + p7_a_sub_p0 = __msa_max_u_b(p7_a_sub_p0, q7_a_sub_q0); \ + flat2_out = __msa_max_u_b(p7_a_sub_p0, flat2_out); \ + \ + flat2_out = (tmp < (v16u8)flat2_out); \ + flat2_out = __msa_xori_b(flat2_out, 0xff); \ + flat2_out = flat2_out & flat_in; \ +} + +#define VP9_FILTER8(p3_in, p2_in, p1_in, p0_in, \ + q0_in, q1_in, q2_in, q3_in, \ + p2_filt8_out, p1_filt8_out, p0_filt8_out, \ + q0_filt8_out, q1_filt8_out, q2_filt8_out) { \ + v8u16 tmp0, tmp1, tmp2; \ + \ + tmp2 = p2_in + p1_in + p0_in; \ + tmp0 = p3_in << 1; \ + \ + tmp0 = tmp0 + tmp2 + q0_in; \ + tmp1 = tmp0 + p3_in + p2_in; \ + p2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \ + \ + tmp1 = tmp0 + p1_in + q1_in; \ + p1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \ + \ + tmp1 = q2_in + q1_in + q0_in; \ + tmp2 = tmp2 + tmp1; \ + tmp0 = tmp2 + (p0_in); \ + tmp0 = tmp0 + (p3_in); \ + p0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp0, 3); \ + \ + tmp0 = q2_in + q3_in; \ + tmp0 = p0_in + tmp1 + tmp0; \ + tmp1 = q3_in + q3_in; \ + tmp1 = tmp1 + tmp0; \ + q2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \ + \ + tmp0 = tmp2 + q3_in; \ + tmp1 = tmp0 + q0_in; \ + q0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \ + \ + tmp1 = tmp0 - p2_in; \ + tmp0 = q1_in + q3_in; \ + tmp1 = tmp0 + tmp1; \ + q1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \ +} + +#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, \ + q0_in, q1_in, q2_in, q3_in, \ + limit_in, b_limit_in, thresh_in, \ + hev_out, mask_out, flat_out) { \ + v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m; \ + v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m; \ + \ + /* absolute subtraction 
of pixel values */ \ + p3_asub_p2_m = __msa_asub_u_b(p3_in, p2_in); \ + p2_asub_p1_m = __msa_asub_u_b(p2_in, p1_in); \ + p1_asub_p0_m = __msa_asub_u_b(p1_in, p0_in); \ + q1_asub_q0_m = __msa_asub_u_b(q1_in, q0_in); \ + q2_asub_q1_m = __msa_asub_u_b(q2_in, q1_in); \ + q3_asub_q2_m = __msa_asub_u_b(q3_in, q2_in); \ + p0_asub_q0_m = __msa_asub_u_b(p0_in, q0_in); \ + p1_asub_q1_m = __msa_asub_u_b(p1_in, q1_in); \ + \ + /* calculation of hev */ \ + flat_out = __msa_max_u_b(p1_asub_p0_m, q1_asub_q0_m); \ + hev_out = thresh_in < (v16u8)flat_out; \ + \ + /* calculation of mask */ \ + p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p0_asub_q0_m); \ + p1_asub_q1_m >>= 1; \ + p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p1_asub_q1_m); \ + \ + mask_out = b_limit_in < p0_asub_q0_m; \ + mask_out = __msa_max_u_b(flat_out, mask_out); \ + p3_asub_p2_m = __msa_max_u_b(p3_asub_p2_m, p2_asub_p1_m); \ + mask_out = __msa_max_u_b(p3_asub_p2_m, mask_out); \ + q2_asub_q1_m = __msa_max_u_b(q2_asub_q1_m, q3_asub_q2_m); \ + mask_out = __msa_max_u_b(q2_asub_q1_m, mask_out); \ + \ + mask_out = limit_in < (v16u8)mask_out; \ + mask_out = __msa_xori_b(mask_out, 0xff); \ +} +#endif /* VP9_COMMON_MIPS_MSA_VP9_LOOPFILTER_MSA_H_ */ diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_macros_msa.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_macros_msa.h index d7aabbb8898..43850758c24 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_macros_msa.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_macros_msa.h @@ -16,852 +16,1971 @@ #include "./vpx_config.h" #include "vpx/vpx_integer.h" -#if HAVE_MSA -/* load macros */ -#define LOAD_UB(psrc) *((const v16u8 *)(psrc)) -#define LOAD_SB(psrc) *((const v16i8 *)(psrc)) -#define LOAD_UH(psrc) *((const v8u16 *)(psrc)) -#define LOAD_SH(psrc) *((const v8i16 *)(psrc)) -#define LOAD_UW(psrc) *((const v4u32 *)(psrc)) -#define LOAD_SW(psrc) *((const v4i32 *)(psrc)) -#define LOAD_UD(psrc) *((const v2u64 *)(psrc)) -#define LOAD_SD(psrc) *((const v2i64 *)(psrc)) - -/* store macros */ -#define STORE_UB(vec, pdest) *((v16u8 *)(pdest)) = (vec) -#define STORE_SB(vec, pdest) *((v16i8 *)(pdest)) = (vec) -#define STORE_UH(vec, pdest) *((v8u16 *)(pdest)) = (vec) -#define STORE_SH(vec, pdest) *((v8i16 *)(pdest)) = (vec) -#define STORE_UW(vec, pdest) *((v4u32 *)(pdest)) = (vec) -#define STORE_SW(vec, pdest) *((v4i32 *)(pdest)) = (vec) -#define STORE_UD(vec, pdest) *((v2u64 *)(pdest)) = (vec) -#define STORE_SD(vec, pdest) *((v2i64 *)(pdest)) = (vec) +#define LD_B(RTYPE, psrc) *((const RTYPE *)(psrc)) +#define LD_UB(...) LD_B(v16u8, __VA_ARGS__) +#define LD_SB(...) LD_B(v16i8, __VA_ARGS__) + +#define LD_H(RTYPE, psrc) *((const RTYPE *)(psrc)) +#define LD_UH(...) LD_H(v8u16, __VA_ARGS__) +#define LD_SH(...) LD_H(v8i16, __VA_ARGS__) + +#define LD_W(RTYPE, psrc) *((const RTYPE *)(psrc)) +#define LD_SW(...) LD_W(v4i32, __VA_ARGS__) + +#define ST_B(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) +#define ST_UB(...) ST_B(v16u8, __VA_ARGS__) +#define ST_SB(...) ST_B(v16i8, __VA_ARGS__) + +#define ST_H(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) +#define ST_SH(...) ST_H(v8i16, __VA_ARGS__) + +#define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) +#define ST_SW(...) 
ST_W(v4i32, __VA_ARGS__) #if (__mips_isa_rev >= 6) -#define LOAD_WORD(psrc) ({ \ - const uint8_t *src_m = (const uint8_t *)(psrc); \ - uint32_t val_m; \ - \ - __asm__ __volatile__ ( \ - "lw %[val_m], %[src_m] \n\t" \ - \ - : [val_m] "=r" (val_m) \ - : [src_m] "m" (*src_m) \ - ); \ - \ - val_m; \ +#define LH(psrc) ({ \ + const uint8_t *psrc_m = (const uint8_t *)(psrc); \ + uint16_t val_m; \ + \ + __asm__ __volatile__ ( \ + "lh %[val_m], %[psrc_m] \n\t" \ + \ + : [val_m] "=r" (val_m) \ + : [psrc_m] "m" (*psrc_m) \ + ); \ + \ + val_m; \ +}) + +#define LW(psrc) ({ \ + const uint8_t *psrc_m = (const uint8_t *)(psrc); \ + uint32_t val_m; \ + \ + __asm__ __volatile__ ( \ + "lw %[val_m], %[psrc_m] \n\t" \ + \ + : [val_m] "=r" (val_m) \ + : [psrc_m] "m" (*psrc_m) \ + ); \ + \ + val_m; \ }) #if (__mips == 64) -#define LOAD_DWORD(psrc) ({ \ - const uint8_t *src_m = (const uint8_t *)(psrc); \ - uint64_t val_m = 0; \ - \ - __asm__ __volatile__ ( \ - "ld %[val_m], %[src_m] \n\t" \ - \ - : [val_m] "=r" (val_m) \ - : [src_m] "m" (*src_m) \ - ); \ - \ - val_m; \ +#define LD(psrc) ({ \ + const uint8_t *psrc_m = (const uint8_t *)(psrc); \ + uint64_t val_m = 0; \ + \ + __asm__ __volatile__ ( \ + "ld %[val_m], %[psrc_m] \n\t" \ + \ + : [val_m] "=r" (val_m) \ + : [psrc_m] "m" (*psrc_m) \ + ); \ + \ + val_m; \ }) #else // !(__mips == 64) -#define LOAD_DWORD(psrc) ({ \ - const uint8_t *src1_m = (const uint8_t *)(psrc); \ - const uint8_t *src2_m = ((const uint8_t *)(psrc)) + 4; \ - uint32_t val0_m, val1_m; \ - uint64_t genval_m = 0; \ - \ - __asm__ __volatile__ ( \ - "lw %[val0_m], %[src1_m] \n\t" \ - \ - : [val0_m] "=r" (val0_m) \ - : [src1_m] "m" (*src1_m) \ - ); \ - \ - __asm__ __volatile__ ( \ - "lw %[val1_m], %[src2_m] \n\t" \ - \ - : [val1_m] "=r" (val1_m) \ - : [src2_m] "m" (*src2_m) \ - ); \ - \ - genval_m = (uint64_t)(val1_m); \ - genval_m = (uint64_t)((genval_m << 32) & 0xFFFFFFFF00000000); \ - genval_m = (uint64_t)(genval_m | (uint64_t)val0_m); \ - \ - genval_m; \ +#define LD(psrc) ({ \ + const uint8_t *psrc_m = (const uint8_t *)(psrc); \ + uint32_t val0_m, val1_m; \ + uint64_t val_m = 0; \ + \ + val0_m = LW(psrc_m); \ + val1_m = LW(psrc_m + 4); \ + \ + val_m = (uint64_t)(val1_m); \ + val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \ + val_m = (uint64_t)(val_m | (uint64_t)val0_m); \ + \ + val_m; \ }) #endif // (__mips == 64) -#define STORE_WORD_WITH_OFFSET_1(pdst, val) { \ - uint8_t *dst_ptr_m = ((uint8_t *)(pdst)) + 1; \ - const uint32_t val_m = (val); \ - \ - __asm__ __volatile__ ( \ - "sw %[val_m], %[dst_ptr_m] \n\t" \ - \ - : [dst_ptr_m] "=m" (*dst_ptr_m) \ - : [val_m] "r" (val_m) \ - ); \ -} - -#define STORE_WORD(pdst, val) { \ - uint8_t *dst_ptr_m = (uint8_t *)(pdst); \ - const uint32_t val_m = (val); \ - \ - __asm__ __volatile__ ( \ - "sw %[val_m], %[dst_ptr_m] \n\t" \ - \ - : [dst_ptr_m] "=m" (*dst_ptr_m) \ - : [val_m] "r" (val_m) \ - ); \ -} - -#define STORE_DWORD(pdst, val) { \ - uint8_t *dst_ptr_m = (uint8_t *)(pdst); \ - const uint64_t val_m = (val); \ - \ - __asm__ __volatile__ ( \ - "sd %[val_m], %[dst_ptr_m] \n\t" \ - \ - : [dst_ptr_m] "=m" (*dst_ptr_m) \ - : [val_m] "r" (val_m) \ - ); \ + +#define SH(val, pdst) { \ + uint8_t *pdst_m = (uint8_t *)(pdst); \ + const uint16_t val_m = (val); \ + \ + __asm__ __volatile__ ( \ + "sh %[val_m], %[pdst_m] \n\t" \ + \ + : [pdst_m] "=m" (*pdst_m) \ + : [val_m] "r" (val_m) \ + ); \ +} + +#define SW(val, pdst) { \ + uint8_t *pdst_m = (uint8_t *)(pdst); \ + const uint32_t val_m = (val); \ + \ + __asm__ __volatile__ ( \ + "sw %[val_m], %[pdst_m] 
\n\t" \ + \ + : [pdst_m] "=m" (*pdst_m) \ + : [val_m] "r" (val_m) \ + ); \ +} + +#define SD(val, pdst) { \ + uint8_t *pdst_m = (uint8_t *)(pdst); \ + const uint64_t val_m = (val); \ + \ + __asm__ __volatile__ ( \ + "sd %[val_m], %[pdst_m] \n\t" \ + \ + : [pdst_m] "=m" (*pdst_m) \ + : [val_m] "r" (val_m) \ + ); \ } #else // !(__mips_isa_rev >= 6) -#define LOAD_WORD(psrc) ({ \ - const uint8_t *src_m = (const uint8_t *)(psrc); \ - uint32_t val_m; \ - \ - __asm__ __volatile__ ( \ - "ulw %[val_m], %[src_m] \n\t" \ - \ - : [val_m] "=r" (val_m) \ - : [src_m] "m" (*src_m) \ - ); \ - \ - val_m; \ +#define LH(psrc) ({ \ + const uint8_t *psrc_m = (const uint8_t *)(psrc); \ + uint16_t val_m; \ + \ + __asm__ __volatile__ ( \ + "ulh %[val_m], %[psrc_m] \n\t" \ + \ + : [val_m] "=r" (val_m) \ + : [psrc_m] "m" (*psrc_m) \ + ); \ + \ + val_m; \ +}) + +#define LW(psrc) ({ \ + const uint8_t *psrc_m = (const uint8_t *)(psrc); \ + uint32_t val_m; \ + \ + __asm__ __volatile__ ( \ + "ulw %[val_m], %[psrc_m] \n\t" \ + \ + : [val_m] "=r" (val_m) \ + : [psrc_m] "m" (*psrc_m) \ + ); \ + \ + val_m; \ }) #if (__mips == 64) -#define LOAD_DWORD(psrc) ({ \ - const uint8_t *src_m = (const uint8_t *)(psrc); \ - uint64_t val_m = 0; \ - \ - __asm__ __volatile__ ( \ - "uld %[val_m], %[src_m] \n\t" \ - \ - : [val_m] "=r" (val_m) \ - : [src_m] "m" (*src_m) \ - ); \ - \ - val_m; \ +#define LD(psrc) ({ \ + const uint8_t *psrc_m = (const uint8_t *)(psrc); \ + uint64_t val_m = 0; \ + \ + __asm__ __volatile__ ( \ + "uld %[val_m], %[psrc_m] \n\t" \ + \ + : [val_m] "=r" (val_m) \ + : [psrc_m] "m" (*psrc_m) \ + ); \ + \ + val_m; \ }) #else // !(__mips == 64) -#define LOAD_DWORD(psrc) ({ \ - const uint8_t *src1_m = (const uint8_t *)(psrc); \ - const uint8_t *src2_m = ((const uint8_t *)(psrc)) + 4; \ - uint32_t val0_m, val1_m; \ - uint64_t genval_m = 0; \ - \ - __asm__ __volatile__ ( \ - "ulw %[val0_m], %[src1_m] \n\t" \ - \ - : [val0_m] "=r" (val0_m) \ - : [src1_m] "m" (*src1_m) \ - ); \ - \ - __asm__ __volatile__ ( \ - "ulw %[val1_m], %[src2_m] \n\t" \ - \ - : [val1_m] "=r" (val1_m) \ - : [src2_m] "m" (*src2_m) \ - ); \ - \ - genval_m = (uint64_t)(val1_m); \ - genval_m = (uint64_t)((genval_m << 32) & 0xFFFFFFFF00000000); \ - genval_m = (uint64_t)(genval_m | (uint64_t)val0_m); \ - \ - genval_m; \ +#define LD(psrc) ({ \ + const uint8_t *psrc_m1 = (const uint8_t *)(psrc); \ + uint32_t val0_m, val1_m; \ + uint64_t val_m = 0; \ + \ + val0_m = LW(psrc_m1); \ + val1_m = LW(psrc_m1 + 4); \ + \ + val_m = (uint64_t)(val1_m); \ + val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \ + val_m = (uint64_t)(val_m | (uint64_t)val0_m); \ + \ + val_m; \ }) #endif // (__mips == 64) -#define STORE_WORD_WITH_OFFSET_1(pdst, val) { \ - uint8_t *dst_ptr_m = ((uint8_t *)(pdst)) + 1; \ - const uint32_t val_m = (val); \ - \ - __asm__ __volatile__ ( \ - "usw %[val_m], %[dst_ptr_m] \n\t" \ - \ - : [dst_ptr_m] "=m" (*dst_ptr_m) \ - : [val_m] "r" (val_m) \ - ); \ +#define SH(val, pdst) { \ + uint8_t *pdst_m = (uint8_t *)(pdst); \ + const uint16_t val_m = (val); \ + \ + __asm__ __volatile__ ( \ + "ush %[val_m], %[pdst_m] \n\t" \ + \ + : [pdst_m] "=m" (*pdst_m) \ + : [val_m] "r" (val_m) \ + ); \ } -#define STORE_WORD(pdst, val) { \ - uint8_t *dst_ptr_m = (uint8_t *)(pdst); \ - const uint32_t val_m = (val); \ - \ - __asm__ __volatile__ ( \ - "usw %[val_m], %[dst_ptr_m] \n\t" \ - \ - : [dst_ptr_m] "=m" (*dst_ptr_m) \ - : [val_m] "r" (val_m) \ - ); \ +#define SW(val, pdst) { \ + uint8_t *pdst_m = (uint8_t *)(pdst); \ + const uint32_t val_m = (val); \ + \ + __asm__ 
__volatile__ ( \ + "usw %[val_m], %[pdst_m] \n\t" \ + \ + : [pdst_m] "=m" (*pdst_m) \ + : [val_m] "r" (val_m) \ + ); \ } -#define STORE_DWORD(pdst, val) { \ - uint8_t *dst1_m = (uint8_t *)(pdst); \ - uint8_t *dst2_m = ((uint8_t *)(pdst)) + 4; \ +#define SD(val, pdst) { \ + uint8_t *pdst_m1 = (uint8_t *)(pdst); \ uint32_t val0_m, val1_m; \ \ val0_m = (uint32_t)((val) & 0x00000000FFFFFFFF); \ val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \ \ - __asm__ __volatile__ ( \ - "usw %[val0_m], %[dst1_m] \n\t" \ - "usw %[val1_m], %[dst2_m] \n\t" \ - \ - : [dst1_m] "=m" (*dst1_m), [dst2_m] "=m" (*dst2_m) \ - : [val0_m] "r" (val0_m), [val1_m] "r" (val1_m) \ - ); \ + SW(val0_m, pdst_m1); \ + SW(val1_m, pdst_m1 + 4); \ } #endif // (__mips_isa_rev >= 6) -#define LOAD_2VECS_UB(psrc, stride, \ - val0, val1) { \ - val0 = LOAD_UB(psrc + 0 * stride); \ - val1 = LOAD_UB(psrc + 1 * stride); \ -} - -#define LOAD_4VECS_UB(psrc, stride, \ - val0, val1, val2, val3) { \ - val0 = LOAD_UB(psrc + 0 * stride); \ - val1 = LOAD_UB(psrc + 1 * stride); \ - val2 = LOAD_UB(psrc + 2 * stride); \ - val3 = LOAD_UB(psrc + 3 * stride); \ -} - -#define LOAD_4VECS_SB(psrc, stride, \ - val0, val1, val2, val3) { \ - val0 = LOAD_SB(psrc + 0 * stride); \ - val1 = LOAD_SB(psrc + 1 * stride); \ - val2 = LOAD_SB(psrc + 2 * stride); \ - val3 = LOAD_SB(psrc + 3 * stride); \ -} - -#define LOAD_5VECS_UB(psrc, stride, \ - out0, out1, out2, out3, out4) { \ - LOAD_4VECS_UB((psrc), (stride), \ - (out0), (out1), (out2), (out3)); \ - out4 = LOAD_UB(psrc + 4 * stride); \ +/* Description : Load 4 words with stride + Arguments : Inputs - psrc (source pointer to load from) + - stride + Outputs - out0, out1, out2, out3 + Details : Loads word in 'out0' from (psrc) + Loads word in 'out1' from (psrc + stride) + Loads word in 'out2' from (psrc + 2 * stride) + Loads word in 'out3' from (psrc + 3 * stride) +*/ +#define LW4(psrc, stride, out0, out1, out2, out3) { \ + out0 = LW((psrc)); \ + out1 = LW((psrc) + stride); \ + out2 = LW((psrc) + 2 * stride); \ + out3 = LW((psrc) + 3 * stride); \ +} + +/* Description : Load double words with stride + Arguments : Inputs - psrc (source pointer to load from) + - stride + Outputs - out0, out1 + Details : Loads double word in 'out0' from (psrc) + Loads double word in 'out1' from (psrc + stride) +*/ +#define LD2(psrc, stride, out0, out1) { \ + out0 = LD((psrc)); \ + out1 = LD((psrc) + stride); \ +} +#define LD4(psrc, stride, out0, out1, out2, out3) { \ + LD2((psrc), stride, out0, out1); \ + LD2((psrc) + 2 * stride, stride, out2, out3); \ +} + +/* Description : Store 4 words with stride + Arguments : Inputs - in0, in1, in2, in3, pdst, stride + Details : Stores word from 'in0' to (pdst) + Stores word from 'in1' to (pdst + stride) + Stores word from 'in2' to (pdst + 2 * stride) + Stores word from 'in3' to (pdst + 3 * stride) +*/ +#define SW4(in0, in1, in2, in3, pdst, stride) { \ + SW(in0, (pdst)) \ + SW(in1, (pdst) + stride); \ + SW(in2, (pdst) + 2 * stride); \ + SW(in3, (pdst) + 3 * stride); \ +} + +/* Description : Store 4 double words with stride + Arguments : Inputs - in0, in1, in2, in3, pdst, stride + Details : Stores double word from 'in0' to (pdst) + Stores double word from 'in1' to (pdst + stride) + Stores double word from 'in2' to (pdst + 2 * stride) + Stores double word from 'in3' to (pdst + 3 * stride) +*/ +#define SD4(in0, in1, in2, in3, pdst, stride) { \ + SD(in0, (pdst)) \ + SD(in1, (pdst) + stride); \ + SD(in2, (pdst) + 2 * stride); \ + SD(in3, (pdst) + 3 * stride); \ +} + +/* Description : Load 
vectors with 16 byte elements with stride + Arguments : Inputs - psrc (source pointer to load from) + - stride + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Loads 16 byte elements in 'out0' from (psrc) + Loads 16 byte elements in 'out1' from (psrc + stride) +*/ +#define LD_B2(RTYPE, psrc, stride, out0, out1) { \ + out0 = LD_B(RTYPE, (psrc)); \ + out1 = LD_B(RTYPE, (psrc) + stride); \ +} +#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__) +#define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__) + +#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) { \ + LD_B2(RTYPE, (psrc), stride, out0, out1); \ + LD_B2(RTYPE, (psrc) + 2 * stride , stride, out2, out3); \ +} +#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__) +#define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__) + +#define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) { \ + LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ + out4 = LD_B(RTYPE, (psrc) + 4 * stride); \ +} +#define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__) +#define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__) + +#define LD_B7(RTYPE, psrc, stride, \ + out0, out1, out2, out3, out4, out5, out6) { \ + LD_B5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4); \ + LD_B2(RTYPE, (psrc) + 5 * stride, stride, out5, out6); \ +} +#define LD_SB7(...) LD_B7(v16i8, __VA_ARGS__) + +#define LD_B8(RTYPE, psrc, stride, \ + out0, out1, out2, out3, out4, out5, out6, out7) { \ + LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ + LD_B4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \ +} +#define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__) +#define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__) + +/* Description : Load vectors with 8 halfword elements with stride + Arguments : Inputs - psrc (source pointer to load from) + - stride + Outputs - out0, out1 + Details : Loads 8 halfword elements in 'out0' from (psrc) + Loads 8 halfword elements in 'out1' from (psrc + stride) +*/ +#define LD_H2(RTYPE, psrc, stride, out0, out1) { \ + out0 = LD_H(RTYPE, (psrc)); \ + out1 = LD_H(RTYPE, (psrc) + (stride)); \ +} +#define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__) + +#define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3) { \ + LD_H2(RTYPE, (psrc), stride, out0, out1); \ + LD_H2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \ +} +#define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__) + +#define LD_H8(RTYPE, psrc, stride, \ + out0, out1, out2, out3, out4, out5, out6, out7) { \ + LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ + LD_H4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \ +} +#define LD_SH8(...) LD_H8(v8i16, __VA_ARGS__) + +#define LD_H16(RTYPE, psrc, stride, \ + out0, out1, out2, out3, out4, out5, out6, out7, \ + out8, out9, out10, out11, out12, out13, out14, out15) { \ + LD_H8(RTYPE, (psrc), stride, \ + out0, out1, out2, out3, out4, out5, out6, out7); \ + LD_H8(RTYPE, (psrc) + 8 * stride, stride, \ + out8, out9, out10, out11, out12, out13, out14, out15); \ +} +#define LD_SH16(...) 
LD_H16(v8i16, __VA_ARGS__) + +/* Description : Load as 4x4 block of signed halfword elements from 1D source + data into 4 vectors (Each vector with 4 signed halfwords) + Arguments : Inputs - psrc + Outputs - out0, out1, out2, out3 +*/ +#define LD4x4_SH(psrc, out0, out1, out2, out3) { \ + out0 = LD_SH(psrc); \ + out2 = LD_SH(psrc + 8); \ + out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \ + out3 = (v8i16)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \ +} + +/* Description : Load 2 vectors of signed word elements with stride + Arguments : Inputs - psrc (source pointer to load from) + - stride + Outputs - out0, out1 + Return Type - signed word +*/ +#define LD_SW2(psrc, stride, out0, out1) { \ + out0 = LD_SW((psrc)); \ + out1 = LD_SW((psrc) + stride); \ +} + +/* Description : Store vectors of 16 byte elements with stride + Arguments : Inputs - in0, in1, stride + Outputs - pdst (destination pointer to store to) + Details : Stores 16 byte elements from 'in0' to (pdst) + Stores 16 byte elements from 'in1' to (pdst + stride) +*/ +#define ST_B2(RTYPE, in0, in1, pdst, stride) { \ + ST_B(RTYPE, in0, (pdst)); \ + ST_B(RTYPE, in1, (pdst) + stride); \ +} +#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__) + +#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) { \ + ST_B2(RTYPE, in0, in1, (pdst), stride); \ + ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ +} +#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__) + +#define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + pdst, stride) { \ + ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride); \ + ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ +} +#define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__) + +/* Description : Store vectors of 8 halfword elements with stride + Arguments : Inputs - in0, in1, stride + Outputs - pdst (destination pointer to store to) + Details : Stores 8 halfword elements from 'in0' to (pdst) + Stores 8 halfword elements from 'in1' to (pdst + stride) +*/ +#define ST_H2(RTYPE, in0, in1, pdst, stride) { \ + ST_H(RTYPE, in0, (pdst)); \ + ST_H(RTYPE, in1, (pdst) + stride); \ +} +#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__) + +#define ST_H4(RTYPE, in0, in1, in2, in3, pdst, stride) { \ + ST_H2(RTYPE, in0, in1, (pdst), stride); \ + ST_H2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ +} +#define ST_SH4(...) ST_H4(v8i16, __VA_ARGS__) + +#define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) { \ + ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride); \ + ST_H4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ +} +#define ST_SH8(...) 
ST_H8(v8i16, __VA_ARGS__) + +/* Description : Store vectors of word elements with stride + Arguments : Inputs - in0, in1, stride + - pdst (destination pointer to store to) + Details : Store 4 word elements from 'in0' to (pdst) + Store 4 word elements from 'in1' to (pdst + stride) +*/ +#define ST_SW2(in0, in1, pdst, stride) { \ + ST_SW(in0, (pdst)); \ + ST_SW(in1, (pdst) + stride); \ +} + +/* Description : Store as 2x4 byte block to destination memory from input vector + Arguments : Inputs - in, stidx, pdst, stride + Return Type - unsigned byte + Details : Index stidx halfword element from 'in' vector is copied and + stored on first line + Index stidx+1 halfword element from 'in' vector is copied and + stored on second line + Index stidx+2 halfword element from 'in' vector is copied and + stored on third line + Index stidx+3 halfword element from 'in' vector is copied and + stored on fourth line +*/ +#define ST2x4_UB(in, stidx, pdst, stride) { \ + uint16_t out0_m, out1_m, out2_m, out3_m; \ + uint8_t *pblk_2x4_m = (uint8_t *)(pdst); \ + \ + out0_m = __msa_copy_u_h((v8i16)in, (stidx)); \ + out1_m = __msa_copy_u_h((v8i16)in, (stidx + 1)); \ + out2_m = __msa_copy_u_h((v8i16)in, (stidx + 2)); \ + out3_m = __msa_copy_u_h((v8i16)in, (stidx + 3)); \ + \ + SH(out0_m, pblk_2x4_m); \ + SH(out1_m, pblk_2x4_m + stride); \ + SH(out2_m, pblk_2x4_m + 2 * stride); \ + SH(out3_m, pblk_2x4_m + 3 * stride); \ +} + +/* Description : Store 4x2 byte block to destination memory from input vector + Arguments : Inputs - in, pdst, stride + Details : Index 0 word element from 'in' vector is copied to a GP + register and stored to (pdst) + Index 1 word element from 'in' vector is copied to a GP + register and stored to (pdst + stride) +*/ +#define ST4x2_UB(in, pdst, stride) { \ + uint32_t out0_m, out1_m; \ + uint8_t *pblk_4x2_m = (uint8_t *)(pdst); \ + \ + out0_m = __msa_copy_u_w((v4i32)in, 0); \ + out1_m = __msa_copy_u_w((v4i32)in, 1); \ + \ + SW(out0_m, pblk_4x2_m); \ + SW(out1_m, pblk_4x2_m + stride); \ +} + +/* Description : Store as 4x4 byte block to destination memory from input vector + Arguments : Inputs - in0, in1, pdst, stride + Return Type - unsigned byte + Details : Idx0 word element from input vector 'in0' is copied and stored + on first line + Idx1 word element from input vector 'in0' is copied and stored + on second line + Idx2 word element from input vector 'in1' is copied and stored + on third line + Idx3 word element from input vector 'in1' is copied and stored + on fourth line +*/ +#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) { \ + uint32_t out0_m, out1_m, out2_m, out3_m; \ + uint8_t *pblk_4x4_m = (uint8_t *)(pdst); \ + \ + out0_m = __msa_copy_u_w((v4i32)in0, idx0); \ + out1_m = __msa_copy_u_w((v4i32)in0, idx1); \ + out2_m = __msa_copy_u_w((v4i32)in1, idx2); \ + out3_m = __msa_copy_u_w((v4i32)in1, idx3); \ + \ + SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride); \ +} +#define ST4x8_UB(in0, in1, pdst, stride) { \ + uint8_t *pblk_4x8 = (uint8_t *)(pdst); \ + \ + ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride); \ + ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride); \ } -#define LOAD_5VECS_SB(psrc, stride, \ - out0, out1, out2, out3, out4) { \ - LOAD_4VECS_SB((psrc), (stride), \ - (out0), (out1), (out2), (out3)); \ - out4 = LOAD_SB(psrc + 4 * stride); \ -} +/* Description : Store as 8x1 byte block to destination memory from input vector + Arguments : Inputs - in, pdst + Details : Index 0 double word element from input vector 'in' is copied + and stored to 
destination memory at (pdst) +*/ +#define ST8x1_UB(in, pdst) { \ + uint64_t out0_m; \ + \ + out0_m = __msa_copy_u_d((v2i64)in, 0); \ + SD(out0_m, pdst); \ +} -#define LOAD_7VECS_SB(psrc, stride, \ - val0, val1, val2, val3, \ - val4, val5, val6) { \ - val0 = LOAD_SB((psrc) + 0 * (stride)); \ - val1 = LOAD_SB((psrc) + 1 * (stride)); \ - val2 = LOAD_SB((psrc) + 2 * (stride)); \ - val3 = LOAD_SB((psrc) + 3 * (stride)); \ - val4 = LOAD_SB((psrc) + 4 * (stride)); \ - val5 = LOAD_SB((psrc) + 5 * (stride)); \ - val6 = LOAD_SB((psrc) + 6 * (stride)); \ -} - -#define LOAD_8VECS_UB(psrc, stride, \ - out0, out1, out2, out3, \ - out4, out5, out6, out7) { \ - LOAD_4VECS_UB((psrc), (stride), \ - (out0), (out1), (out2), (out3)); \ - LOAD_4VECS_UB((psrc + 4 * stride), (stride), \ - (out4), (out5), (out6), (out7)); \ -} - -#define LOAD_8VECS_SB(psrc, stride, \ - out0, out1, out2, out3, \ - out4, out5, out6, out7) { \ - LOAD_4VECS_SB((psrc), (stride), \ - (out0), (out1), (out2), (out3)); \ - LOAD_4VECS_SB((psrc + 4 * stride), (stride), \ - (out4), (out5), (out6), (out7)); \ -} - -#define LOAD_2VECS_SH(psrc, stride, \ - val0, val1) { \ - val0 = LOAD_SH((psrc) + 0 * (stride)); \ - val1 = LOAD_SH((psrc) + 1 * (stride)); \ -} - -#define LOAD_4VECS_SH(psrc, stride, \ - val0, val1, val2, val3) { \ - LOAD_2VECS_SH((psrc), (stride), val0, val1); \ - LOAD_2VECS_SH((psrc + 2 * stride), (stride), val2, val3); \ -} - -#define LOAD_8VECS_SH(psrc, stride, \ - val0, val1, val2, val3, \ - val4, val5, val6, val7) { \ - LOAD_4VECS_SH((psrc), (stride), \ - val0, val1, val2, val3); \ - LOAD_4VECS_SH((psrc + 4 * stride), (stride), \ - val4, val5, val6, val7); \ -} - -#define LOAD_16VECS_SH(psrc, stride, \ - val0, val1, val2, val3, \ - val4, val5, val6, val7, \ - val8, val9, val10, val11, \ - val12, val13, val14, val15) { \ - LOAD_8VECS_SH((psrc), (stride), \ - val0, val1, val2, val3, \ - val4, val5, val6, val7); \ - LOAD_8VECS_SH((psrc + 8 * (stride)), (stride), \ - val8, val9, val10, val11, \ - val12, val13, val14, val15); \ -} - -#define STORE_4VECS_UB(dst_out, pitch, \ - in0, in1, in2, in3) { \ - STORE_UB((in0), (dst_out)); \ - STORE_UB((in1), ((dst_out) + (pitch))); \ - STORE_UB((in2), ((dst_out) + 2 * (pitch))); \ - STORE_UB((in3), ((dst_out) + 3 * (pitch))); \ -} - -#define STORE_8VECS_UB(dst_out, pitch_in, \ - in0, in1, in2, in3, \ - in4, in5, in6, in7) { \ - STORE_4VECS_UB(dst_out, pitch_in, \ - in0, in1, in2, in3); \ - STORE_4VECS_UB((dst_out + 4 * (pitch_in)), pitch_in, \ - in4, in5, in6, in7); \ -} - -#define VEC_INSERT_4W_UB(src, src0, src1, src2, src3) { \ - src = (v16u8)__msa_insert_w((v4i32)(src), 0, (src0)); \ - src = (v16u8)__msa_insert_w((v4i32)(src), 1, (src1)); \ - src = (v16u8)__msa_insert_w((v4i32)(src), 2, (src2)); \ - src = (v16u8)__msa_insert_w((v4i32)(src), 3, (src3)); \ -} - -#define VEC_INSERT_2DW_UB(src, src0, src1) { \ - src = (v16u8)__msa_insert_d((v2i64)(src), 0, (src0)); \ - src = (v16u8)__msa_insert_d((v2i64)(src), 1, (src1)); \ -} - -#define STORE_4VECS_SH(ptr, stride, \ - in0, in1, in2, in3) { \ - STORE_SH(in0, ((ptr) + 0 * stride)); \ - STORE_SH(in1, ((ptr) + 1 * stride)); \ - STORE_SH(in2, ((ptr) + 2 * stride)); \ - STORE_SH(in3, ((ptr) + 3 * stride)); \ -} - -#define STORE_8VECS_SH(ptr, stride, \ - in0, in1, in2, in3, \ - in4, in5, in6, in7) { \ - STORE_SH(in0, ((ptr) + 0 * stride)); \ - STORE_SH(in1, ((ptr) + 1 * stride)); \ - STORE_SH(in2, ((ptr) + 2 * stride)); \ - STORE_SH(in3, ((ptr) + 3 * stride)); \ - STORE_SH(in4, ((ptr) + 4 * stride)); \ - STORE_SH(in5, ((ptr) + 5 * stride)); \ - 
STORE_SH(in6, ((ptr) + 6 * stride)); \ - STORE_SH(in7, ((ptr) + 7 * stride)); \ -} - -#define CLIP_UNSIGNED_CHAR_H(in) ({ \ +/* Description : Store as 8x2 byte block to destination memory from input vector + Arguments : Inputs - in, pdst, stride + Details : Index 0 double word element from input vector 'in' is copied + and stored to destination memory at (pdst) + Index 1 double word element from input vector 'in' is copied + and stored to destination memory at (pdst + stride) +*/ +#define ST8x2_UB(in, pdst, stride) { \ + uint64_t out0_m, out1_m; \ + uint8_t *pblk_8x2_m = (uint8_t *)(pdst); \ + \ + out0_m = __msa_copy_u_d((v2i64)in, 0); \ + out1_m = __msa_copy_u_d((v2i64)in, 1); \ + \ + SD(out0_m, pblk_8x2_m); \ + SD(out1_m, pblk_8x2_m + stride); \ +} + +/* Description : Store as 8x4 byte block to destination memory from input + vectors + Arguments : Inputs - in0, in1, pdst, stride + Details : Index 0 double word element from input vector 'in0' is copied + and stored to destination memory at (pblk_8x4_m) + Index 1 double word element from input vector 'in0' is copied + and stored to destination memory at (pblk_8x4_m + stride) + Index 0 double word element from input vector 'in1' is copied + and stored to destination memory at (pblk_8x4_m + 2 * stride) + Index 1 double word element from input vector 'in1' is copied + and stored to destination memory at (pblk_8x4_m + 3 * stride) +*/ +#define ST8x4_UB(in0, in1, pdst, stride) { \ + uint64_t out0_m, out1_m, out2_m, out3_m; \ + uint8_t *pblk_8x4_m = (uint8_t *)(pdst); \ + \ + out0_m = __msa_copy_u_d((v2i64)in0, 0); \ + out1_m = __msa_copy_u_d((v2i64)in0, 1); \ + out2_m = __msa_copy_u_d((v2i64)in1, 0); \ + out3_m = __msa_copy_u_d((v2i64)in1, 1); \ + \ + SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride); \ +} + +/* Description : average with rounding (in0 + in1 + 1) / 2. + Arguments : Inputs - in0, in1, in2, in3, + Outputs - out0, out1 + Return Type - signed byte + Details : Each byte element from 'in0' vector is added with each byte + element from 'in1' vector. The addition of the elements plus 1 + (for rounding) is done unsigned with full precision, + i.e. the result has one extra bit. Unsigned division by 2 + (or logical shift right by one bit) is performed before writing + the result to vector 'out0' + Similar for the pair of 'in2' and 'in3' +*/ +#define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) { \ + out0 = (RTYPE)__msa_aver_u_b((v16u8)in0, (v16u8)in1); \ + out1 = (RTYPE)__msa_aver_u_b((v16u8)in2, (v16u8)in3); \ +} +#define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__) + +#define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3) { \ + AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) \ + AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3) \ +} +#define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__) + +/* Description : Immediate number of columns to slide with zero + Arguments : Inputs - in0, in1, slide_val + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Byte elements from 'zero_m' vector are slide into 'in0' by + number of elements specified by 'slide_val' +*/ +#define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val) { \ + v16i8 zero_m = { 0 }; \ + out0 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in0, slide_val); \ + out1 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in1, slide_val); \ +} +#define SLDI_B2_0_SW(...) 
SLDI_B2_0(v4i32, __VA_ARGS__) + +#define SLDI_B4_0(RTYPE, in0, in1, in2, in3, \ + out0, out1, out2, out3, slide_val) { \ + SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val); \ + SLDI_B2_0(RTYPE, in2, in3, out2, out3, slide_val); \ +} +#define SLDI_B4_0_UB(...) SLDI_B4_0(v16u8, __VA_ARGS__) + +/* Description : Immediate number of columns to slide + Arguments : Inputs - in0_0, in0_1, in1_0, in1_1, slide_val + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Byte elements from 'in0_0' vector are slide into 'in1_0' by + number of elements specified by 'slide_val' +*/ +#define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) { \ + out0 = (RTYPE)__msa_sldi_b((v16i8)in0_0, (v16i8)in1_0, slide_val); \ + out1 = (RTYPE)__msa_sldi_b((v16i8)in0_1, (v16i8)in1_1, slide_val); \ +} +#define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__) + +#define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2, \ + out0, out1, out2, slide_val) { \ + SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) \ + out2 = (RTYPE)__msa_sldi_b((v16i8)in0_2, (v16i8)in1_2, slide_val); \ +} +#define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__) +#define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__) + +/* Description : Shuffle byte vector elements as per mask vector + Arguments : Inputs - in0, in1, in2, in3, mask0, mask1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Selective byte elements from in0 & in1 are copied to out0 as + per control vector mask0 + Selective byte elements from in2 & in3 are copied to out1 as + per control vector mask1 +*/ +#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) { \ + out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0); \ + out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2); \ +} +#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__) +#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__) +#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__) + +#define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3, \ + out0, out1, out2, out3) { \ + VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1); \ + VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3); \ +} +#define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__) +#define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__) + +/* Description : Dot product of byte vector elements + Arguments : Inputs - mult0, mult1 + cnst0, cnst1 + Outputs - out0, out1 + Return Type - unsigned halfword + Details : Unsigned byte elements from mult0 are multiplied with + unsigned byte elements from cnst0 producing a result + twice the size of input i.e. unsigned halfword. + Then this multiplication results of adjacent odd-even elements + are added together and stored to the out vector + (2 unsigned halfword results) +*/ +#define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \ + out0 = (RTYPE)__msa_dotp_u_h((v16u8)mult0, (v16u8)cnst0); \ + out1 = (RTYPE)__msa_dotp_u_h((v16u8)mult1, (v16u8)cnst1); \ +} +#define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__) + +#define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3, \ + cnst0, cnst1, cnst2, cnst3, \ + out0, out1, out2, out3) { \ + DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ + DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ +} +#define DOTP_UB4_UH(...) 
DOTP_UB4(v8u16, __VA_ARGS__) + +/* Description : Dot product of byte vector elements + Arguments : Inputs - mult0, mult1 + cnst0, cnst1 + Outputs - out0, out1 + Return Type - signed halfword + Details : Signed byte elements from mult0 are multiplied with + signed byte elements from cnst0 producing a result + twice the size of input i.e. signed halfword. + Then this multiplication results of adjacent odd-even elements + are added together and stored to the out vector + (2 signed halfword results) +*/ +#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \ + out0 = (RTYPE)__msa_dotp_s_h((v16i8)mult0, (v16i8)cnst0); \ + out1 = (RTYPE)__msa_dotp_s_h((v16i8)mult1, (v16i8)cnst1); \ +} +#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__) + +#define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3, \ + cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) { \ + DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ + DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ +} +#define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__) + +/* Description : Dot product of halfword vector elements + Arguments : Inputs - mult0, mult1 + cnst0, cnst1 + Outputs - out0, out1 + Return Type - signed word + Details : Signed halfword elements from mult0 are multiplied with + signed halfword elements from cnst0 producing a result + twice the size of input i.e. signed word. + Then this multiplication results of adjacent odd-even elements + are added together and stored to the out vector + (2 signed word results) +*/ +#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \ + out0 = (RTYPE)__msa_dotp_s_w((v8i16)mult0, (v8i16)cnst0); \ + out1 = (RTYPE)__msa_dotp_s_w((v8i16)mult1, (v8i16)cnst1); \ +} +#define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__) + +#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3, \ + cnst0, cnst1, cnst2, cnst3, \ + out0, out1, out2, out3) { \ + DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ + DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ +} +#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__) + +/* Description : Dot product of word vector elements + Arguments : Inputs - mult0, mult1 + cnst0, cnst1 + Outputs - out0, out1 + Return Type - signed word + Details : Signed word elements from mult0 are multiplied with + signed word elements from cnst0 producing a result + twice the size of input i.e. signed double word. + Then this multiplication results of adjacent odd-even elements + are added together and stored to the out vector + (2 signed double word results) +*/ +#define DOTP_SW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \ + out0 = (RTYPE)__msa_dotp_s_d((v4i32)mult0, (v4i32)cnst0); \ + out1 = (RTYPE)__msa_dotp_s_d((v4i32)mult1, (v4i32)cnst1); \ +} +#define DOTP_SW2_SD(...) DOTP_SW2(v2i64, __VA_ARGS__) + +/* Description : Dot product & addition of byte vector elements + Arguments : Inputs - mult0, mult1 + cnst0, cnst1 + Outputs - out0, out1 + Return Type - signed halfword + Details : Signed byte elements from mult0 are multiplied with + signed byte elements from cnst0 producing a result + twice the size of input i.e. signed halfword. + Then this multiplication results of adjacent odd-even elements + are added to the out vector + (2 signed halfword results) +*/ +#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \ + out0 = (RTYPE)__msa_dpadd_s_h((v8i16)out0, (v16i8)mult0, (v16i8)cnst0); \ + out1 = (RTYPE)__msa_dpadd_s_h((v8i16)out1, (v16i8)mult1, (v16i8)cnst1); \ +} +#define DPADD_SB2_SH(...) 
DPADD_SB2(v8i16, __VA_ARGS__)
+
+#define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3,                    \
+                  cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) { \
+  DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);             \
+  DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);             \
+}
+#define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)
+
+/* Description : Dot product & addition of halfword vector elements
+   Arguments   : Inputs  - mult0, mult1
+                           cnst0, cnst1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Signed halfword elements from 'mult0' are multiplied with
+                 signed halfword elements from 'cnst0' producing a result
+                 twice the size of input i.e. signed word.
+                 The multiplication results of adjacent odd-even elements
+                 are added to the 'out0' vector
+*/
+#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) {        \
+  out0 = (RTYPE)__msa_dpadd_s_w((v4i32)out0, (v8i16)mult0, (v8i16)cnst0); \
+  out1 = (RTYPE)__msa_dpadd_s_w((v4i32)out1, (v8i16)mult1, (v8i16)cnst1); \
+}
+#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)
+
+/* Description : Dot product & addition of double word vector elements
+   Arguments   : Inputs  - mult0, mult1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Each signed word element from 'mult0' is multiplied with itself
+                 producing an intermediate result twice the size of input
+                 i.e. signed double word
+                 The multiplication results of adjacent odd-even elements
+                 are added to the 'out0' vector
+*/
+#define DPADD_SD2(RTYPE, mult0, mult1, out0, out1) {                      \
+  out0 = (RTYPE)__msa_dpadd_s_d((v2i64)out0, (v4i32)mult0, (v4i32)mult0); \
+  out1 = (RTYPE)__msa_dpadd_s_d((v2i64)out1, (v4i32)mult1, (v4i32)mult1); \
+}
+#define DPADD_SD2_SD(...) DPADD_SD2(v2i64, __VA_ARGS__)
+
+/* Description : Minimum values between unsigned elements of
+                 either vector are copied to the output vector
+   Arguments   : Inputs  - in0, in1, min_vec
+                 Outputs - in0, in1 (in place)
+                 Return Type - unsigned halfword
+   Details     : Minimum of unsigned halfword element values from 'in0' and
+                 'min_vec' are written to output vector 'in0'
+*/
+#define MIN_UH2(RTYPE, in0, in1, min_vec) {        \
+  in0 = (RTYPE)__msa_min_u_h((v8u16)in0, min_vec); \
+  in1 = (RTYPE)__msa_min_u_h((v8u16)in1, min_vec); \
+}
+#define MIN_UH2_UH(...) MIN_UH2(v8u16, __VA_ARGS__)
+
+#define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec) { \
+  MIN_UH2(RTYPE, in0, in1, min_vec);                  \
+  MIN_UH2(RTYPE, in2, in3, min_vec);                  \
+}
+#define MIN_UH4_UH(...)
MIN_UH4(v8u16, __VA_ARGS__) + +/* Description : Clips all signed halfword elements of input vector + between 0 & 255 + Arguments : Inputs - in (input vector) + Outputs - out_m (output vector with clipped elements) + Return Type - signed halfword +*/ +#define CLIP_SH_0_255(in) ({ \ v8i16 max_m = __msa_ldi_h(255); \ v8i16 out_m; \ \ - out_m = __msa_maxi_s_h((v8i16)(in), 0); \ + out_m = __msa_maxi_s_h((v8i16)in, 0); \ out_m = __msa_min_s_h((v8i16)max_m, (v8i16)out_m); \ out_m; \ }) +#define CLIP_SH2_0_255(in0, in1) { \ + in0 = CLIP_SH_0_255(in0); \ + in1 = CLIP_SH_0_255(in1); \ +} +#define CLIP_SH4_0_255(in0, in1, in2, in3) { \ + CLIP_SH2_0_255(in0, in1); \ + CLIP_SH2_0_255(in2, in3); \ +} -/* halfword 8x8 transpose macro */ -#define TRANSPOSE8x8_H_SH(in0, in1, in2, in3, \ - in4, in5, in6, in7, \ - out0, out1, out2, out3, \ - out4, out5, out6, out7) { \ - v8i16 s0_m, s1_m; \ - v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ - \ - s0_m = __msa_ilvr_h((v8i16)(in6), (v8i16)(in4)); \ - s1_m = __msa_ilvr_h((v8i16)(in7), (v8i16)(in5)); \ - tmp0_m = __msa_ilvr_h((v8i16)s1_m, (v8i16)s0_m); \ - tmp1_m = __msa_ilvl_h((v8i16)s1_m, (v8i16)s0_m); \ - \ - s0_m = __msa_ilvl_h((v8i16)(in6), (v8i16)(in4)); \ - s1_m = __msa_ilvl_h((v8i16)(in7), (v8i16)(in5)); \ - tmp2_m = __msa_ilvr_h((v8i16)s1_m, (v8i16)s0_m); \ - tmp3_m = __msa_ilvl_h((v8i16)s1_m, (v8i16)s0_m); \ - \ - s0_m = __msa_ilvr_h((v8i16)(in2), (v8i16)(in0)); \ - s1_m = __msa_ilvr_h((v8i16)(in3), (v8i16)(in1)); \ - tmp4_m = __msa_ilvr_h((v8i16)s1_m, (v8i16)s0_m); \ - tmp5_m = __msa_ilvl_h((v8i16)s1_m, (v8i16)s0_m); \ - \ - s0_m = __msa_ilvl_h((v8i16)(in2), (v8i16)(in0)); \ - s1_m = __msa_ilvl_h((v8i16)(in3), (v8i16)(in1)); \ - tmp6_m = __msa_ilvr_h((v8i16)s1_m, (v8i16)s0_m); \ - tmp7_m = __msa_ilvl_h((v8i16)s1_m, (v8i16)s0_m); \ - \ - out0 = (v8i16)__msa_pckev_d((v2i64)tmp0_m, (v2i64)tmp4_m); \ - out1 = (v8i16)__msa_pckod_d((v2i64)tmp0_m, (v2i64)tmp4_m); \ - out2 = (v8i16)__msa_pckev_d((v2i64)tmp1_m, (v2i64)tmp5_m); \ - out3 = (v8i16)__msa_pckod_d((v2i64)tmp1_m, (v2i64)tmp5_m); \ - out4 = (v8i16)__msa_pckev_d((v2i64)tmp2_m, (v2i64)tmp6_m); \ - out5 = (v8i16)__msa_pckod_d((v2i64)tmp2_m, (v2i64)tmp6_m); \ - out6 = (v8i16)__msa_pckev_d((v2i64)tmp3_m, (v2i64)tmp7_m); \ - out7 = (v8i16)__msa_pckod_d((v2i64)tmp3_m, (v2i64)tmp7_m); \ -} - -/* interleave macros */ -/* no in-place support */ -#define ILV_B_LRLR_UB(in0, in1, in2, in3, \ - out0, out1, out2, out3) { \ - out0 = (v16u8)__msa_ilvl_b((v16i8)(in1), (v16i8)(in0)); \ - out1 = (v16u8)__msa_ilvr_b((v16i8)(in1), (v16i8)(in0)); \ - out2 = (v16u8)__msa_ilvl_b((v16i8)(in3), (v16i8)(in2)); \ - out3 = (v16u8)__msa_ilvr_b((v16i8)(in3), (v16i8)(in2)); \ -} - -#define ILV_H_LRLR_SH(in0, in1, in2, in3, \ - out0, out1, out2, out3) { \ - out0 = __msa_ilvl_h((v8i16)(in1), (v8i16)(in0)); \ - out1 = __msa_ilvr_h((v8i16)(in1), (v8i16)(in0)); \ - out2 = __msa_ilvl_h((v8i16)(in3), (v8i16)(in2)); \ - out3 = __msa_ilvr_h((v8i16)(in3), (v8i16)(in2)); \ -} - -#define ILV_H_LR_SH(in0, in1, out0, out1) { \ - out0 = __msa_ilvl_h((v8i16)(in1), (v8i16)(in0)); \ - out1 = __msa_ilvr_h((v8i16)(in1), (v8i16)(in0)); \ -} - -#define ILVR_B_2VECS_UB(in0_r, in1_r, in0_l, in1_l, \ - out0, out1) { \ - out0 = (v16u8)__msa_ilvr_b((v16i8)(in0_l), (v16i8)(in0_r)); \ - out1 = (v16u8)__msa_ilvr_b((v16i8)(in1_l), (v16i8)(in1_r)); \ -} - -#define ILVR_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \ - out0, out1) { \ - out0 = __msa_ilvr_b((v16i8)(in0_l), (v16i8)(in0_r)); \ - out1 = __msa_ilvr_b((v16i8)(in1_l), 
(v16i8)(in1_r)); \ -} - -#define ILVR_B_4VECS_UB(in0_r, in1_r, in2_r, in3_r, \ - in0_l, in1_l, in2_l, in3_l, \ - out0, out1, out2, out3) { \ - ILVR_B_2VECS_UB(in0_r, in1_r, in0_l, in1_l, \ - out0, out1); \ - ILVR_B_2VECS_UB(in2_r, in3_r, in2_l, in3_l, \ - out2, out3); \ -} - -#define ILVR_B_4VECS_SB(in0_r, in1_r, in2_r, in3_r, \ - in0_l, in1_l, in2_l, in3_l, \ - out0, out1, out2, out3) { \ - ILVR_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \ - out0, out1); \ - ILVR_B_2VECS_SB(in2_r, in3_r, in2_l, in3_l, \ - out2, out3); \ -} - -#define ILVR_B_6VECS_SB(in0_r, in1_r, in2_r, \ - in3_r, in4_r, in5_r, \ - in0_l, in1_l, in2_l, \ - in3_l, in4_l, in5_l, \ - out0, out1, out2, \ - out3, out4, out5) { \ - ILVR_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \ - out0, out1); \ - ILVR_B_2VECS_SB(in2_r, in3_r, in2_l, in3_l, \ - out2, out3); \ - ILVR_B_2VECS_SB(in4_r, in5_r, in4_l, in5_l, \ - out4, out5); \ -} - -#define ILVR_B_8VECS_SB(in0_r, in1_r, in2_r, in3_r, \ - in4_r, in5_r, in6_r, in7_r, \ - in0_l, in1_l, in2_l, in3_l, \ - in4_l, in5_l, in6_l, in7_l, \ - out0, out1, out2, out3, \ - out4, out5, out6, out7) { \ - ILVR_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \ - out0, out1); \ - ILVR_B_2VECS_SB(in2_r, in3_r, in2_l, in3_l, \ - out2, out3); \ - ILVR_B_2VECS_SB(in4_r, in5_r, in4_l, in5_l, \ - out4, out5); \ - ILVR_B_2VECS_SB(in6_r, in7_r, in6_l, in7_l, \ - out6, out7); \ -} - -#define ILVL_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \ - out0, out1) { \ - out0 = __msa_ilvl_b((v16i8)(in0_l), (v16i8)(in0_r)); \ - out1 = __msa_ilvl_b((v16i8)(in1_l), (v16i8)(in1_r)); \ -} - -#define ILVL_B_4VECS_SB(in0_r, in1_r, in2_r, in3_r, \ - in0_l, in1_l, in2_l, in3_l, \ - out0, out1, out2, out3) { \ - ILVL_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \ - out0, out1); \ - ILVL_B_2VECS_SB(in2_r, in3_r, in2_l, in3_l, \ - out2, out3); \ -} - -#define ILVL_B_6VECS_SB(in0_r, in1_r, in2_r, \ - in3_r, in4_r, in5_r, \ - in0_l, in1_l, in2_l, \ - in3_l, in4_l, in5_l, \ - out0, out1, out2, \ - out3, out4, out5) { \ - ILVL_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \ - out0, out1); \ - ILVL_B_2VECS_SB(in2_r, in3_r, in2_l, in3_l, \ - out2, out3); \ - ILVL_B_2VECS_SB(in4_r, in5_r, in4_l, in5_l, \ - out4, out5); \ -} - -#define ILVR_D_2VECS_SB(out0, in0_l, in0_r, \ - out1, in1_l, in1_r) { \ - out0 = (v16i8)__msa_ilvr_d((v2i64)(in0_l), (v2i64)(in0_r)); \ - out1 = (v16i8)__msa_ilvr_d((v2i64)(in1_l), (v2i64)(in1_r)); \ -} - -#define ILVR_D_3VECS_SB(out0, in0_l, in0_r, \ - out1, in1_l, in1_r, \ - out2, in2_l, in2_r) { \ - ILVR_D_2VECS_SB(out0, in0_l, in0_r, \ - out1, in1_l, in1_r); \ - out2 = (v16i8)__msa_ilvr_d((v2i64)(in2_l), (v2i64)(in2_r)); \ -} - -#define ILVR_D_4VECS_SB(out0, in0_l, in0_r, \ - out1, in1_l, in1_r, \ - out2, in2_l, in2_r, \ - out3, in3_l, in3_r) { \ - ILVR_D_2VECS_SB(out0, in0_l, in0_r, \ - out1, in1_l, in1_r); \ - ILVR_D_2VECS_SB(out2, in2_l, in2_r, \ - out3, in3_l, in3_r); \ -} - -#define DOTP_S_W_4VECS_SW(m0, c0, m1, c1, \ - m2, c2, m3, c3, \ - out0, out1, out2, out3) { \ - out0 = __msa_dotp_s_w((v8i16)(m0), (v8i16)(c0)); \ - out1 = __msa_dotp_s_w((v8i16)(m1), (v8i16)(c1)); \ - out2 = __msa_dotp_s_w((v8i16)(m2), (v8i16)(c2)); \ - out3 = __msa_dotp_s_w((v8i16)(m3), (v8i16)(c3)); \ -} - -#define PCKEV_H_2VECS_SH(in0_l, in0_r, in1_l, in1_r, \ - out0, out1) { \ - out0 = __msa_pckev_h((v8i16)(in0_l), (v8i16)(in0_r)); \ - out1 = __msa_pckev_h((v8i16)(in1_l), (v8i16)(in1_r)); \ -} - -#define XORI_B_2VECS_UB(val0, val1, \ - out0, out1, xor_val) { \ - out0 = __msa_xori_b((v16u8)(val0), (xor_val)); \ - out1 = __msa_xori_b((v16u8)(val1), (xor_val)); \ -} 
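The legacy per-count helpers removed here are superseded by the generic two-operand macros added above (CLIP_SH2_0_255, ST8x2_UB, and friends). A rough usage sketch only, not taken from this patch; the function name, the variable names, and the rounding shift of 6 are assumptions:

static void example_round_clip_store_8x2(v8i16 res0, v8i16 res1,
                                         uint8_t *dst, int32_t dst_stride) {
  v16u8 out;

  /* round each halfword result with an assumed shift of 6 */
  res0 = __msa_srari_h(res0, 6);
  res1 = __msa_srari_h(res1, 6);
  /* clamp both vectors to the unsigned byte range in place */
  CLIP_SH2_0_255(res0, res1);
  /* pack the even bytes of both vectors into one 16-byte vector */
  out = (v16u8)__msa_pckev_b((v16i8)res1, (v16i8)res0);
  /* write 8 bytes to each of two destination rows */
  ST8x2_UB(out, dst, dst_stride);
}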
- -#define XORI_B_2VECS_SB(val0, val1, \ - out0, out1, xor_val) { \ - out0 = (v16i8)__msa_xori_b((v16u8)(val0), (xor_val)); \ - out1 = (v16i8)__msa_xori_b((v16u8)(val1), (xor_val)); \ -} - -#define XORI_B_3VECS_SB(val0, val1, val2, \ - out0, out1, out2, xor_val) { \ - XORI_B_2VECS_SB(val0, val1, out0, out1, xor_val); \ - out2 = (v16i8)__msa_xori_b((v16u8)(val2), (xor_val)); \ -} - -#define XORI_B_4VECS_UB(val0, val1, val2, val3, \ - out0, out1, out2, out3, \ - xor_val) { \ - XORI_B_2VECS_UB(val0, val1, out0, out1, xor_val); \ - XORI_B_2VECS_UB(val2, val3, out2, out3, xor_val); \ -} - -#define XORI_B_4VECS_SB(val0, val1, val2, val3, \ - out0, out1, out2, out3, \ - xor_val) { \ - XORI_B_2VECS_SB(val0, val1, out0, out1, xor_val); \ - XORI_B_2VECS_SB(val2, val3, out2, out3, xor_val); \ +/* Description : Addition of 4 signed word elements + 4 signed word elements of input vector are added together and + the resulting integer sum is returned + Arguments : Inputs - in (signed word vector) + Outputs - sum_m (i32 sum) + Return Type - signed word +*/ +#define HADD_SW_S32(in) ({ \ + v2i64 res0_m, res1_m; \ + int32_t sum_m; \ + \ + res0_m = __msa_hadd_s_d((v4i32)in, (v4i32)in); \ + res1_m = __msa_splati_d(res0_m, 1); \ + res0_m = res0_m + res1_m; \ + sum_m = __msa_copy_s_w((v4i32)res0_m, 0); \ + sum_m; \ +}) + +/* Description : Horizontal addition of unsigned byte vector elements + Arguments : Inputs - in0, in1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Each unsigned odd byte element from 'in0' is added to + even unsigned byte element from 'in0' (pairwise) and the + halfword result is stored in 'out0' +*/ +#define HADD_UB2(RTYPE, in0, in1, out0, out1) { \ + out0 = (RTYPE)__msa_hadd_u_h((v16u8)in0, (v16u8)in0); \ + out1 = (RTYPE)__msa_hadd_u_h((v16u8)in1, (v16u8)in1); \ } +#define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__) -#define XORI_B_7VECS_SB(val0, val1, val2, val3, \ - val4, val5, val6, \ - out0, out1, out2, out3, \ - out4, out5, out6, \ - xor_val) { \ - XORI_B_4VECS_SB(val0, val1, val2, val3, \ - out0, out1, out2, out3, xor_val); \ - XORI_B_3VECS_SB(val4, val5, val6, \ - out4, out5, out6, xor_val); \ +#define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) { \ + HADD_UB2(RTYPE, in0, in1, out0, out1); \ + HADD_UB2(RTYPE, in2, in3, out2, out3); \ +} +#define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__) + +/* Description : Horizontal subtraction of unsigned byte vector elements + Arguments : Inputs - in0, in1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Each unsigned odd byte element from 'in0' is subtracted from + even unsigned byte element from 'in0' (pairwise) and the + halfword result is written to 'out0' +*/ +#define HSUB_UB2(RTYPE, in0, in1, out0, out1) { \ + out0 = (RTYPE)__msa_hsub_u_h((v16u8)in0, (v16u8)in0); \ + out1 = (RTYPE)__msa_hsub_u_h((v16u8)in1, (v16u8)in1); \ +} +#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__) + +/* Description : Horizontal subtraction of signed halfword vector elements + Arguments : Inputs - in0, in1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Each signed odd halfword element from 'in0' is subtracted from + even signed halfword element from 'in0' (pairwise) and the + word result is written to 'out0' +*/ +#define HSUB_UH2(RTYPE, in0, in1, out0, out1) { \ + out0 = (RTYPE)__msa_hsub_s_w((v8i16)in0, (v8i16)in0); \ + out1 = (RTYPE)__msa_hsub_s_w((v8i16)in1, (v8i16)in1); \ +} +#define HSUB_UH2_SW(...) 
HSUB_UH2(v4i32, __VA_ARGS__) + +/* Description : Insert specified word elements from input vectors to 1 + destination vector + Arguments : Inputs - in0, in1, in2, in3 (4 input vectors) + Outputs - out (output vector) + Return Type - as per RTYPE +*/ +#define INSERT_W2(RTYPE, in0, in1, out) { \ + out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \ + out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \ } +#define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__) -#define SRARI_H_4VECS_UH(val0, val1, val2, val3, \ - out0, out1, out2, out3, \ - shift_right_val) { \ - out0 = (v8u16)__msa_srari_h((v8i16)(val0), (shift_right_val)); \ - out1 = (v8u16)__msa_srari_h((v8i16)(val1), (shift_right_val)); \ - out2 = (v8u16)__msa_srari_h((v8i16)(val2), (shift_right_val)); \ - out3 = (v8u16)__msa_srari_h((v8i16)(val3), (shift_right_val)); \ +#define INSERT_W4(RTYPE, in0, in1, in2, in3, out) { \ + out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \ + out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \ + out = (RTYPE)__msa_insert_w((v4i32)out, 2, in2); \ + out = (RTYPE)__msa_insert_w((v4i32)out, 3, in3); \ +} +#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__) +#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__) + +/* Description : Insert specified double word elements from input vectors to 1 + destination vector + Arguments : Inputs - in0, in1 (2 input vectors) + Outputs - out (output vector) + Return Type - as per RTYPE +*/ +#define INSERT_D2(RTYPE, in0, in1, out) { \ + out = (RTYPE)__msa_insert_d((v2i64)out, 0, in0); \ + out = (RTYPE)__msa_insert_d((v2i64)out, 1, in1); \ +} +#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__) +#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__) + +/* Description : Interleave even byte elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even byte elements of 'in0' and even byte + elements of 'in1' are interleaved and copied to 'out0' + Even byte elements of 'in2' and even byte + elements of 'in3' are interleaved and copied to 'out1' +*/ +#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \ + out0 = (RTYPE)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \ + out1 = (RTYPE)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \ +} +#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__) +#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__) + +/* Description : Interleave even halfword elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even halfword elements of 'in0' and even halfword + elements of 'in1' are interleaved and copied to 'out0' + Even halfword elements of 'in2' and even halfword + elements of 'in3' are interleaved and copied to 'out1' +*/ +#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \ + out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0); \ + out1 = (RTYPE)__msa_ilvev_h((v8i16)in3, (v8i16)in2); \ } +#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__) +#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__) +#define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__) + +/* Description : Interleave even word elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even word elements of 'in0' and 'in1' are interleaved + and written to 'out0' +*/ +#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) { \ + out0 = (RTYPE)__msa_ilvev_w((v4i32)in1, (v4i32)in0); \ + out1 = (RTYPE)__msa_ilvev_w((v4i32)in3, (v4i32)in2); \ +} +#define ILVEV_W2_SB(...) 
ILVEV_W2(v16i8, __VA_ARGS__) + +/* Description : Interleave even double word elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even double word elements of 'in0' and even double word + elements of 'in1' are interleaved and copied to 'out0' + Even double word elements of 'in2' and even double word + elements of 'in3' are interleaved and copied to 'out1' +*/ +#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) { \ + out0 = (RTYPE)__msa_ilvev_d((v2i64)in1, (v2i64)in0); \ + out1 = (RTYPE)__msa_ilvev_d((v2i64)in3, (v2i64)in2); \ +} +#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__) + +/* Description : Interleave left half of byte elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Left half of byte elements of in0 and left half of byte + elements of in1 are interleaved and copied to out0. + Left half of byte elements of in2 and left half of byte + elements of in3 are interleaved and copied to out1. +*/ +#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \ + out0 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \ + out1 = (RTYPE)__msa_ilvl_b((v16i8)in2, (v16i8)in3); \ +} +#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__) +#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__) +#define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__) +#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__) + +#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3) { \ + ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ +} +#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__) +#define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__) + +/* Description : Interleave left half of halfword elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Left half of halfword elements of in0 and left half of halfword + elements of in1 are interleaved and copied to out0. + Left half of halfword elements of in2 and left half of halfword + elements of in3 are interleaved and copied to out1. +*/ +#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \ + out0 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \ + out1 = (RTYPE)__msa_ilvl_h((v8i16)in2, (v8i16)in3); \ +} +#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__) + +/* Description : Interleave left half of word elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Left half of word elements of in0 and left half of word + elements of in1 are interleaved and copied to out0. + Left half of word elements of in2 and left half of word + elements of in3 are interleaved and copied to out1. +*/ +#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1) { \ + out0 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \ + out1 = (RTYPE)__msa_ilvl_w((v4i32)in2, (v4i32)in3); \ +} +#define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__) +#define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__) + +/* Description : Interleave right half of byte elements from vectors + Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 + Outputs - out0, out1, out2, out3 + Return Type - as per RTYPE + Details : Right half of byte elements of in0 and right half of byte + elements of in1 are interleaved and copied to out0. + Right half of byte elements of in2 and right half of byte + elements of in3 are interleaved and copied to out1. 
+ Similar for other pairs +*/ +#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \ + out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \ + out1 = (RTYPE)__msa_ilvr_b((v16i8)in2, (v16i8)in3); \ +} +#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__) +#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__) +#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__) +#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__) + +#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3) { \ + ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ +} +#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__) +#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__) +#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__) +#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__) + +#define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + in8, in9, in10, in11, in12, in13, in14, in15, \ + out0, out1, out2, out3, out4, out5, out6, out7) { \ + ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3); \ + ILVR_B4(RTYPE, in8, in9, in10, in11, in12, in13, in14, in15, \ + out4, out5, out6, out7); \ +} +#define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__) + +/* Description : Interleave right half of halfword elements from vectors + Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 + Outputs - out0, out1, out2, out3 + Return Type - signed halfword + Details : Right half of halfword elements of in0 and right half of + halfword elements of in1 are interleaved and copied to out0. + Right half of halfword elements of in2 and right half of + halfword elements of in3 are interleaved and copied to out1. + Similar for other pairs +*/ +#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \ + out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \ + out1 = (RTYPE)__msa_ilvr_h((v8i16)in2, (v8i16)in3); \ +} +#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__) + +#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3) { \ + ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3); \ +} +#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__) -#define SRARI_H_4VECS_SH(val0, val1, val2, val3, \ - out0, out1, out2, out3, \ - shift_right_val) { \ - out0 = __msa_srari_h((v8i16)(val0), (shift_right_val)); \ - out1 = __msa_srari_h((v8i16)(val1), (shift_right_val)); \ - out2 = __msa_srari_h((v8i16)(val2), (shift_right_val)); \ - out3 = __msa_srari_h((v8i16)(val3), (shift_right_val)); \ +#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1) { \ + out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \ + out1 = (RTYPE)__msa_ilvr_w((v4i32)in2, (v4i32)in3); \ } +#define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__) +#define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__) -#define SRARI_W_4VECS_SW(val0, val1, val2, val3, \ - out0, out1, out2, out3, \ - shift_right_val) { \ - out0 = __msa_srari_w((v4i32)(val0), (shift_right_val)); \ - out1 = __msa_srari_w((v4i32)(val1), (shift_right_val)); \ - out2 = __msa_srari_w((v4i32)(val2), (shift_right_val)); \ - out3 = __msa_srari_w((v4i32)(val3), (shift_right_val)); \ +#define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3) { \ + ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3); \ } +#define ILVR_W4_UB(...) 
ILVR_W4(v16u8, __VA_ARGS__) + +/* Description : Interleave right half of double word elements from vectors + Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 + Outputs - out0, out1, out2, out3 + Return Type - unsigned double word + Details : Right half of double word elements of in0 and right half of + double word elements of in1 are interleaved and copied to out0. + Right half of double word elements of in2 and right half of + double word elements of in3 are interleaved and copied to out1. +*/ +#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) { \ + out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1)); \ + out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3)); \ +} +#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__) +#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__) +#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__) + +#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) { \ + ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ + out2 = (RTYPE)__msa_ilvr_d((v2i64)(in4), (v2i64)(in5)); \ +} +#define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__) -#define SRARI_SATURATE_UNSIGNED_H(input, right_shift_val, sat_val) ({ \ - v8u16 out_m; \ +#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3) { \ + ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3); \ +} +#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__) +#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__) + +/* Description : Interleave both left and right half of input vectors + Arguments : Inputs - in0, in1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Right half of byte elements from 'in0' and 'in1' are + interleaved and stored to 'out0' + Left half of byte elements from 'in0' and 'in1' are + interleaved and stored to 'out1' +*/ +#define ILVRL_B2(RTYPE, in0, in1, out0, out1) { \ + out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \ + out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \ +} +#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__) +#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__) +#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__) +#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__) + +#define ILVRL_H2(RTYPE, in0, in1, out0, out1) { \ + out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \ + out1 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \ +} +#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__) +#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__) + +#define ILVRL_W2(RTYPE, in0, in1, out0, out1) { \ + out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \ + out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \ +} +#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__) +#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__) + +/* Description : Saturate the halfword element values to the max + unsigned value of (sat_val+1 bits) + The element data width remains unchanged + Arguments : Inputs - in0, in1, in2, in3, sat_val + Outputs - in0, in1, in2, in3 (in place) + Return Type - unsigned halfword + Details : Each unsigned halfword element from 'in0' is saturated to the + value generated with (sat_val+1) bit range. + The results are stored in place +*/ +#define SAT_UH2(RTYPE, in0, in1, sat_val) { \ + in0 = (RTYPE)__msa_sat_u_h((v8u16)in0, sat_val); \ + in1 = (RTYPE)__msa_sat_u_h((v8u16)in1, sat_val); \ +} +#define SAT_UH2_UH(...) 
SAT_UH2(v8u16, __VA_ARGS__) + +#define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val) { \ + SAT_UH2(RTYPE, in0, in1, sat_val); \ + SAT_UH2(RTYPE, in2, in3, sat_val) \ +} +#define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__) + +/* Description : Saturate the halfword element values to the max + unsigned value of (sat_val+1 bits) + The element data width remains unchanged + Arguments : Inputs - in0, in1, in2, in3, sat_val + Outputs - in0, in1, in2, in3 (in place) + Return Type - unsigned halfword + Details : Each unsigned halfword element from 'in0' is saturated to the + value generated with (sat_val+1) bit range + The results are stored in place +*/ +#define SAT_SH2(RTYPE, in0, in1, sat_val) { \ + in0 = (RTYPE)__msa_sat_s_h((v8i16)in0, sat_val); \ + in1 = (RTYPE)__msa_sat_s_h((v8i16)in1, sat_val); \ +} +#define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__) + +#define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val) { \ + SAT_SH2(RTYPE, in0, in1, sat_val); \ + SAT_SH2(RTYPE, in2, in3, sat_val); \ +} +#define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__) + +/* Description : Indexed halfword element values are replicated to all + elements in output vector + Arguments : Inputs - in, idx0, idx1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : 'idx0' element value from 'in' vector is replicated to all + elements in 'out0' vector + Valid index range for halfword operation is 0-7 +*/ +#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1) { \ + out0 = (RTYPE)__msa_splati_h((v8i16)in, idx0); \ + out1 = (RTYPE)__msa_splati_h((v8i16)in, idx1); \ +} +#define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__) + +#define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3, \ + out0, out1, out2, out3) { \ + SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1); \ + SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3); \ +} +#define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__) +#define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__) + +/* Description : Pack even byte elements of vector pairs + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even byte elements of in0 are copied to the left half of + out0 & even byte elements of in1 are copied to the right + half of out0. + Even byte elements of in2 are copied to the left half of + out1 & even byte elements of in3 are copied to the right + half of out1. +*/ +#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \ + out0 = (RTYPE)__msa_pckev_b((v16i8)in0, (v16i8)in1); \ + out1 = (RTYPE)__msa_pckev_b((v16i8)in2, (v16i8)in3); \ +} +#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__) +#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__) +#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__) + +#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3) { \ + PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ + PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ +} +#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__) +#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__) +#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__) + +/* Description : Pack even halfword elements of vector pairs + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even halfword elements of in0 are copied to the left half of + out0 & even halfword elements of in1 are copied to the right + half of out0. + Even halfword elements of in2 are copied to the left half of + out1 & even halfword elements of in3 are copied to the right + half of out1. 
+*/ +#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \ + out0 = (RTYPE)__msa_pckev_h((v8i16)in0, (v8i16)in1); \ + out1 = (RTYPE)__msa_pckev_h((v8i16)in2, (v8i16)in3); \ +} +#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__) +#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__) + +#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3) { \ + PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1); \ + PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3); \ +} +#define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__) + +/* Description : Pack even double word elements of vector pairs + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - unsigned byte + Details : Even double elements of in0 are copied to the left half of + out0 & even double elements of in1 are copied to the right + half of out0. + Even double elements of in2 are copied to the left half of + out1 & even double elements of in3 are copied to the right + half of out1. +*/ +#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) { \ + out0 = (RTYPE)__msa_pckev_d((v2i64)in0, (v2i64)in1); \ + out1 = (RTYPE)__msa_pckev_d((v2i64)in2, (v2i64)in3); \ +} +#define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__) +#define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__) + +#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3) { \ + PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ + PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3); \ +} +#define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__) + +/* Description : Each byte element is logically xor'ed with immediate 128 + Arguments : Inputs - in0, in1 + Outputs - in0, in1 (in-place) + Return Type - as per RTYPE + Details : Each unsigned byte element from input vector 'in0' is + logically xor'ed with 128 and the result is in-place stored in + 'in0' vector + Each unsigned byte element from input vector 'in1' is + logically xor'ed with 128 and the result is in-place stored in + 'in1' vector + Similar for other pairs +*/ +#define XORI_B2_128(RTYPE, in0, in1) { \ + in0 = (RTYPE)__msa_xori_b((v16u8)in0, 128); \ + in1 = (RTYPE)__msa_xori_b((v16u8)in1, 128); \ +} +#define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__) +#define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__) + +#define XORI_B3_128(RTYPE, in0, in1, in2) { \ + XORI_B2_128(RTYPE, in0, in1); \ + in2 = (RTYPE)__msa_xori_b((v16u8)in2, 128); \ +} +#define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__) + +#define XORI_B4_128(RTYPE, in0, in1, in2, in3) { \ + XORI_B2_128(RTYPE, in0, in1); \ + XORI_B2_128(RTYPE, in2, in3); \ +} +#define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__) +#define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__) + +#define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6) { \ + XORI_B4_128(RTYPE, in0, in1, in2, in3); \ + XORI_B3_128(RTYPE, in4, in5, in6); \ +} +#define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__) + +/* Description : Average of signed halfword elements -> (a + b) / 2 + Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 + Outputs - out0, out1, out2, out3 + Return Type - as per RTYPE + Details : Each signed halfword element from 'in0' is added to each + signed halfword element of 'in1' with full precision resulting + in one extra bit in the result. 
The result is then divided by + 2 and written to 'out0' +*/ +#define AVE_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3) { \ + out0 = (RTYPE)__msa_ave_s_h((v8i16)in0, (v8i16)in1); \ + out1 = (RTYPE)__msa_ave_s_h((v8i16)in2, (v8i16)in3); \ + out2 = (RTYPE)__msa_ave_s_h((v8i16)in4, (v8i16)in5); \ + out3 = (RTYPE)__msa_ave_s_h((v8i16)in6, (v8i16)in7); \ +} +#define AVE_SH4_SH(...) AVE_SH4(v8i16, __VA_ARGS__) + +/* Description : Addition of signed halfword elements and signed saturation + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Signed halfword elements from 'in0' are added to signed + halfword elements of 'in1'. The result is then signed saturated + between -32768 to +32767 (as per halfword data type) + Similar for other pairs +*/ +#define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1) { \ + out0 = (RTYPE)__msa_adds_s_h((v8i16)in0, (v8i16)in1); \ + out1 = (RTYPE)__msa_adds_s_h((v8i16)in2, (v8i16)in3); \ +} +#define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__) + +#define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3) { \ + ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3); \ +} +#define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__) + +/* Description : Shift left all elements of vector (generic for all data types) + Arguments : Inputs - in0, in1, in2, in3, shift + Outputs - in0, in1, in2, in3 (in place) + Return Type - as per input vector RTYPE + Details : Each element of vector 'in0' is left shifted by 'shift' and + the result is in place written to 'in0' + Similar for other pairs +*/ +#define SLLI_4V(in0, in1, in2, in3, shift) { \ + in0 = in0 << shift; \ + in1 = in1 << shift; \ + in2 = in2 << shift; \ + in3 = in3 << shift; \ +} + +/* Description : Arithmetic shift right all elements of vector + (generic for all data types) + Arguments : Inputs - in0, in1, in2, in3, shift + Outputs - in0, in1, in2, in3 (in place) + Return Type - as per input vector RTYPE + Details : Each element of vector 'in0' is right shifted by 'shift' and + the result is in place written to 'in0' + Here, 'shift' is GP variable passed in + Similar for other pairs +*/ +#define SRA_4V(in0, in1, in2, in3, shift) { \ + in0 = in0 >> shift; \ + in1 = in1 >> shift; \ + in2 = in2 >> shift; \ + in3 = in3 >> shift; \ +} + +/* Description : Shift right arithmetic rounded words + Arguments : Inputs - in0, in1, shift + Outputs - in place operation + Return Type - as per RTYPE + Details : Each element of vector 'in0' is shifted right arithmetically by + the number of bits in the corresponding element in the vector + 'shift'. The last discarded bit is added to shifted value for + rounding and the result is written in-place. + 'shift' is a vector. +*/ +#define SRAR_W2(RTYPE, in0, in1, shift) { \ + in0 = (RTYPE)__msa_srar_w((v4i32)in0, (v4i32)shift); \ + in1 = (RTYPE)__msa_srar_w((v4i32)in1, (v4i32)shift); \ +} + +#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift) { \ + SRAR_W2(RTYPE, in0, in1, shift) \ + SRAR_W2(RTYPE, in2, in3, shift) \ +} +#define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__) + +/* Description : Shift right arithmetic rounded (immediate) + Arguments : Inputs - in0, in1, in2, in3, shift + Outputs - in0, in1, in2, in3 (in place) + Return Type - as per RTYPE + Details : Each element of vector 'in0' is shifted right arithmetic by + value in 'shift'. 
+                 The last discarded bit is added to the shifted value for rounding
+                 and the result is written in place to 'in0'
+                 Similar for other pairs
+*/
+#define SRARI_H2(RTYPE, in0, in1, shift) {       \
+  in0 = (RTYPE)__msa_srari_h((v8i16)in0, shift); \
+  in1 = (RTYPE)__msa_srari_h((v8i16)in1, shift); \
+}
+#define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
+#define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)
+
+#define SRARI_H4(RTYPE, in0, in1, in2, in3, shift) { \
+  SRARI_H2(RTYPE, in0, in1, shift);                  \
+  SRARI_H2(RTYPE, in2, in3, shift);                  \
+}
+#define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__)
+#define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__)
+
+/* Description : Shift right arithmetic rounded (immediate)
+   Arguments   : Inputs  - in0, in1, shift
+                 Outputs - in0, in1 (in place)
+                 Return Type - as per RTYPE
+   Details     : Each element of vector 'in0' is shifted right arithmetically by
+                 the value in 'shift'.
+                 The last discarded bit is added to the shifted value for rounding
+                 and the result is written in place to 'in0'
+                 Similar for other pairs
+*/
+#define SRARI_W2(RTYPE, in0, in1, shift) {       \
+  in0 = (RTYPE)__msa_srari_w((v4i32)in0, shift); \
+  in1 = (RTYPE)__msa_srari_w((v4i32)in1, shift); \
+}
+#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)
+
+#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) { \
+  SRARI_W2(RTYPE, in0, in1, shift);                  \
+  SRARI_W2(RTYPE, in2, in3, shift);                  \
+}
+#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
+
+/* Description : Logical shift right all elements of vector (immediate)
+   Arguments   : Inputs  - in0, in1, in2, in3, shift
+                 Outputs - out0, out1, out2, out3
+                 Return Type - as per RTYPE
+   Details     : Each element of vector 'in0' is right shifted by 'shift' and
+                 the result is written to 'out0'. 'shift' is an immediate value.
+*/
+#define SRLI_H4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, shift) { \
+  out0 = (RTYPE)__msa_srli_h((v8i16)in0, shift);                            \
+  out1 = (RTYPE)__msa_srli_h((v8i16)in1, shift);                            \
+  out2 = (RTYPE)__msa_srli_h((v8i16)in2, shift);                            \
+  out3 = (RTYPE)__msa_srli_h((v8i16)in3, shift);                            \
+}
+#define SRLI_H4_SH(...)
SRLI_H4(v8i16, __VA_ARGS__) + +/* Description : Multiplication of pairs of vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Details : Each element from 'in0' is multiplied with elements from 'in1' + and the result is written to 'out0' +*/ +#define MUL2(in0, in1, in2, in3, out0, out1) { \ + out0 = in0 * in1; \ + out1 = in2 * in3; \ +} +#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3) { \ + MUL2(in0, in1, in2, in3, out0, out1); \ + MUL2(in4, in5, in6, in7, out2, out3); \ +} + +/* Description : Addition of 2 pairs of vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Details : Each element from 2 pairs vectors is added and 2 results are + produced +*/ +#define ADD2(in0, in1, in2, in3, out0, out1) { \ + out0 = in0 + in1; \ + out1 = in2 + in3; \ +} +#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3) { \ + ADD2(in0, in1, in2, in3, out0, out1); \ + ADD2(in4, in5, in6, in7, out2, out3); \ +} + +/* Description : Subtraction of 2 pairs of vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Details : Each element from 2 pairs vectors is subtracted and 2 results + are produced +*/ +#define SUB2(in0, in1, in2, in3, out0, out1) { \ + out0 = in0 - in1; \ + out1 = in2 - in3; \ +} +#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3) { \ + out0 = in0 - in1; \ + out1 = in2 - in3; \ + out2 = in4 - in5; \ + out3 = in6 - in7; \ +} + +/* Description : Sign extend halfword elements from right half of the vector + Arguments : Inputs - in (input halfword vector) + Outputs - out (sign extended word vectors) + Return Type - signed word + Details : Sign bit of halfword elements from input vector 'in' is + extracted and interleaved with same vector 'in0' to generate + 4 word elements keeping sign intact +*/ +#define UNPCK_R_SH_SW(in, out) { \ + v8i16 sign_m; \ + \ + sign_m = __msa_clti_s_h((v8i16)in, 0); \ + out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)in); \ +} + +/* Description : Zero extend unsigned byte elements to halfword elements + Arguments : Inputs - in (1 input unsigned byte vector) + Outputs - out0, out1 (unsigned 2 halfword vectors) + Return Type - signed halfword + Details : Zero extended right half of vector is returned in 'out0' + Zero extended left half of vector is returned in 'out1' +*/ +#define UNPCK_UB_SH(in, out0, out1) { \ + v16i8 zero_m = { 0 }; \ + \ + ILVRL_B2_SH(zero_m, in, out0, out1); \ +} + +/* Description : Sign extend halfword elements from input vector and return + result in pair of vectors + Arguments : Inputs - in (1 input halfword vector) + Outputs - out0, out1 (sign extended 2 word vectors) + Return Type - signed word + Details : Sign bit of halfword elements from input vector 'in' is + extracted and interleaved right with same vector 'in0' to + generate 4 signed word elements in 'out0' + Then interleaved left with same vector 'in0' to + generate 4 signed word elements in 'out1' +*/ +#define UNPCK_SH_SW(in, out0, out1) { \ + v8i16 tmp_m; \ + \ + tmp_m = __msa_clti_s_h((v8i16)in, 0); \ + ILVRL_H2_SW(tmp_m, in, out0, out1); \ +} + +/* Description : Butterfly of 4 input vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1, out2, out3 + Details : Butterfly operation +*/ +#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) { \ + out0 = in0 + in3; \ + out1 = in1 + in2; \ + \ + out2 = in1 - in2; \ + out3 = in0 - in3; \ +} + +/* Description : Butterfly of 8 input vectors + Arguments : Inputs - in0 ... 
in7 + Outputs - out0 .. out7 + Details : Butterfly operation +*/ +#define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3, out4, out5, out6, out7) { \ + out0 = in0 + in7; \ + out1 = in1 + in6; \ + out2 = in2 + in5; \ + out3 = in3 + in4; \ \ - out_m = (v8u16)__msa_srari_h((v8i16)(input), (right_shift_val)); \ - out_m = __msa_sat_u_h(out_m, (sat_val)); \ - out_m; \ -}) + out4 = in3 - in4; \ + out5 = in2 - in5; \ + out6 = in1 - in6; \ + out7 = in0 - in7; \ +} -#define SRARI_SATURATE_SIGNED_H(input, right_shift_val, sat_val) ({ \ - v8i16 out_m; \ - \ - out_m = __msa_srari_h((v8i16)(input), (right_shift_val)); \ - out_m = __msa_sat_s_h(out_m, (sat_val)); \ - out_m; \ -}) +/* Description : Butterfly of 16 input vectors + Arguments : Inputs - in0 ... in15 + Outputs - out0 .. out15 + Details : Butterfly operation +*/ +#define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, \ + in8, in9, in10, in11, in12, in13, in14, in15, \ + out0, out1, out2, out3, out4, out5, out6, out7, \ + out8, out9, out10, out11, out12, out13, out14, out15) { \ + out0 = in0 + in15; \ + out1 = in1 + in14; \ + out2 = in2 + in13; \ + out3 = in3 + in12; \ + out4 = in4 + in11; \ + out5 = in5 + in10; \ + out6 = in6 + in9; \ + out7 = in7 + in8; \ + \ + out8 = in7 - in8; \ + out9 = in6 - in9; \ + out10 = in5 - in10; \ + out11 = in4 - in11; \ + out12 = in3 - in12; \ + out13 = in2 - in13; \ + out14 = in1 - in14; \ + out15 = in0 - in15; \ +} -#define PCKEV_2B_XORI128_STORE_4_BYTES_4(in1, in2, \ - pdst, stride) { \ - uint32_t out0_m, out1_m, out2_m, out3_m; \ - v16i8 tmp0_m; \ - uint8_t *dst_m = (uint8_t *)(pdst); \ - \ - tmp0_m = __msa_pckev_b((v16i8)(in2), (v16i8)(in1)); \ - tmp0_m = (v16i8)__msa_xori_b((v16u8)tmp0_m, 128); \ - \ - out0_m = __msa_copy_u_w((v4i32)tmp0_m, 0); \ - out1_m = __msa_copy_u_w((v4i32)tmp0_m, 1); \ - out2_m = __msa_copy_u_w((v4i32)tmp0_m, 2); \ - out3_m = __msa_copy_u_w((v4i32)tmp0_m, 3); \ - \ - STORE_WORD(dst_m, out0_m); \ - dst_m += stride; \ - STORE_WORD(dst_m, out1_m); \ - dst_m += stride; \ - STORE_WORD(dst_m, out2_m); \ - dst_m += stride; \ - STORE_WORD(dst_m, out3_m); \ -} - -#define PCKEV_B_4_XORI128_STORE_8_BYTES_4(in1, in2, \ - in3, in4, \ - pdst, stride) { \ - uint64_t out0_m, out1_m, out2_m, out3_m; \ - v16i8 tmp0_m, tmp1_m; \ - uint8_t *dst_m = (uint8_t *)(pdst); \ - \ - tmp0_m = __msa_pckev_b((v16i8)(in2), (v16i8)(in1)); \ - tmp1_m = __msa_pckev_b((v16i8)(in4), (v16i8)(in3)); \ - \ - tmp0_m = (v16i8)__msa_xori_b((v16u8)tmp0_m, 128); \ - tmp1_m = (v16i8)__msa_xori_b((v16u8)tmp1_m, 128); \ - \ - out0_m = __msa_copy_u_d((v2i64)tmp0_m, 0); \ - out1_m = __msa_copy_u_d((v2i64)tmp0_m, 1); \ - out2_m = __msa_copy_u_d((v2i64)tmp1_m, 0); \ - out3_m = __msa_copy_u_d((v2i64)tmp1_m, 1); \ - \ - STORE_DWORD(dst_m, out0_m); \ - dst_m += stride; \ - STORE_DWORD(dst_m, out1_m); \ - dst_m += stride; \ - STORE_DWORD(dst_m, out2_m); \ - dst_m += stride; \ - STORE_DWORD(dst_m, out3_m); \ -} - -/* Only for signed vecs */ -#define PCKEV_B_XORI128_STORE_VEC(in1, in2, pdest) { \ - v16i8 tmp_m; \ - \ - tmp_m = __msa_pckev_b((v16i8)(in1), (v16i8)(in2)); \ - tmp_m = (v16i8)__msa_xori_b((v16u8)tmp_m, 128); \ - STORE_SB(tmp_m, (pdest)); \ -} - -/* Only for signed vecs */ -#define PCKEV_B_4_XORI128_AVG_STORE_8_BYTES_4(in1, dst0, \ - in2, dst1, \ - in3, dst2, \ - in4, dst3, \ - pdst, stride) { \ - uint64_t out0_m, out1_m, out2_m, out3_m; \ - v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - uint8_t *dst_m = (uint8_t *)(pdst); \ - \ - tmp0_m = (v16u8)__msa_pckev_b((v16i8)(in2), (v16i8)(in1)); \ - 
tmp1_m = (v16u8)__msa_pckev_b((v16i8)(in4), (v16i8)(in3)); \ - \ - tmp2_m = (v16u8)__msa_ilvr_d((v2i64)(dst1), (v2i64)(dst0)); \ - tmp3_m = (v16u8)__msa_ilvr_d((v2i64)(dst3), (v2i64)(dst2)); \ - \ - tmp0_m = __msa_xori_b(tmp0_m, 128); \ - tmp1_m = __msa_xori_b(tmp1_m, 128); \ - \ - tmp0_m = __msa_aver_u_b(tmp0_m, tmp2_m); \ - tmp1_m = __msa_aver_u_b(tmp1_m, tmp3_m); \ - \ - out0_m = __msa_copy_u_d((v2i64)tmp0_m, 0); \ - out1_m = __msa_copy_u_d((v2i64)tmp0_m, 1); \ - out2_m = __msa_copy_u_d((v2i64)tmp1_m, 0); \ - out3_m = __msa_copy_u_d((v2i64)tmp1_m, 1); \ - \ - STORE_DWORD(dst_m, out0_m); \ - dst_m += stride; \ - STORE_DWORD(dst_m, out1_m); \ - dst_m += stride; \ - STORE_DWORD(dst_m, out2_m); \ - dst_m += stride; \ - STORE_DWORD(dst_m, out3_m); \ -} - -/* Only for signed vecs */ -#define PCKEV_B_XORI128_AVG_STORE_VEC(in1, in2, dst, pdest) { \ - v16u8 tmp_m; \ - \ - tmp_m = (v16u8)__msa_pckev_b((v16i8)(in1), (v16i8)(in2)); \ - tmp_m = __msa_xori_b(tmp_m, 128); \ - tmp_m = __msa_aver_u_b(tmp_m, (v16u8)(dst)); \ - STORE_UB(tmp_m, (pdest)); \ -} - -#define PCKEV_B_STORE_8_BYTES_4(in1, in2, in3, in4, \ - pdst, stride) { \ - uint64_t out0_m, out1_m, out2_m, out3_m; \ - v16i8 tmp0_m, tmp1_m; \ - uint8_t *dst_m = (uint8_t *)(pdst); \ - \ - tmp0_m = __msa_pckev_b((v16i8)(in2), (v16i8)(in1)); \ - tmp1_m = __msa_pckev_b((v16i8)(in4), (v16i8)(in3)); \ - \ - out0_m = __msa_copy_u_d((v2i64)tmp0_m, 0); \ - out1_m = __msa_copy_u_d((v2i64)tmp0_m, 1); \ - out2_m = __msa_copy_u_d((v2i64)tmp1_m, 0); \ - out3_m = __msa_copy_u_d((v2i64)tmp1_m, 1); \ - \ - STORE_DWORD(dst_m, out0_m); \ - dst_m += stride; \ - STORE_DWORD(dst_m, out1_m); \ - dst_m += stride; \ - STORE_DWORD(dst_m, out2_m); \ - dst_m += stride; \ - STORE_DWORD(dst_m, out3_m); \ -} - -/* Only for unsigned vecs */ -#define PCKEV_B_STORE_VEC(in1, in2, pdest) { \ - v16i8 tmp_m; \ - \ - tmp_m = __msa_pckev_b((v16i8)(in1), (v16i8)(in2)); \ - STORE_SB(tmp_m, (pdest)); \ +/* Description : Transposes input 8x8 byte block + Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 + (input 8x8 byte block) + Outputs - out0, out1, out2, out3, out4, out5, out6, out7 + (output 8x8 byte block) + Return Type - unsigned byte + Details : +*/ +#define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3, out4, out5, out6, out7) { \ + v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ + \ + ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5, \ + tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ + ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m); \ + ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m); \ + ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2); \ + ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6); \ + SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8); \ + SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8); \ +} +#define TRANSPOSE8x8_UB_UB(...) 
TRANSPOSE8x8_UB(v16u8, __VA_ARGS__) + +/* Description : Transposes 16x8 block into 8x16 with byte elements in vectors + Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, + in8, in9, in10, in11, in12, in13, in14, in15 + Outputs - out0, out1, out2, out3, out4, out5, out6, out7 + Return Type - unsigned byte + Details : +*/ +#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, \ + in8, in9, in10, in11, in12, in13, in14, in15, \ + out0, out1, out2, out3, out4, out5, out6, out7) { \ + v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ + \ + ILVEV_D2_UB(in0, in8, in1, in9, out7, out6); \ + ILVEV_D2_UB(in2, in10, in3, in11, out5, out4); \ + ILVEV_D2_UB(in4, in12, in5, in13, out3, out2); \ + ILVEV_D2_UB(in6, in14, in7, in15, out1, out0); \ + \ + tmp0_m = (v16u8)__msa_ilvev_b((v16i8)out6, (v16i8)out7); \ + tmp4_m = (v16u8)__msa_ilvod_b((v16i8)out6, (v16i8)out7); \ + tmp1_m = (v16u8)__msa_ilvev_b((v16i8)out4, (v16i8)out5); \ + tmp5_m = (v16u8)__msa_ilvod_b((v16i8)out4, (v16i8)out5); \ + out5 = (v16u8)__msa_ilvev_b((v16i8)out2, (v16i8)out3); \ + tmp6_m = (v16u8)__msa_ilvod_b((v16i8)out2, (v16i8)out3); \ + out7 = (v16u8)__msa_ilvev_b((v16i8)out0, (v16i8)out1); \ + tmp7_m = (v16u8)__msa_ilvod_b((v16i8)out0, (v16i8)out1); \ + \ + ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m); \ + out0 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ + out4 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ + \ + tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m); \ + tmp3_m = (v16u8)__msa_ilvod_h((v8i16)out7, (v8i16)out5); \ + out2 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ + out6 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ + \ + ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m); \ + out1 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ + out5 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ + \ + tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m); \ + tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m); \ + tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m); \ + tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m); \ + out3 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ + out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ } -#define PCKEV_B_AVG_STORE_8_BYTES_4(in1, dst0, in2, dst1, \ - in3, dst2, in4, dst3, \ - pdst, stride) { \ - uint64_t out0_m, out1_m, out2_m, out3_m; \ - v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - uint8_t *dst_m = (uint8_t *)(pdst); \ - \ - tmp0_m = (v16u8)__msa_pckev_b((v16i8)(in2), (v16i8)(in1)); \ - tmp1_m = (v16u8)__msa_pckev_b((v16i8)(in4), (v16i8)(in3)); \ - \ - tmp2_m = (v16u8)__msa_pckev_d((v2i64)(dst1), (v2i64)(dst0)); \ - tmp3_m = (v16u8)__msa_pckev_d((v2i64)(dst3), (v2i64)(dst2)); \ - \ - tmp0_m = __msa_aver_u_b(tmp0_m, tmp2_m); \ - tmp1_m = __msa_aver_u_b(tmp1_m, tmp3_m); \ +/* Description : Transposes 4x4 block with half word elements in vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1, out2, out3 + Return Type - signed halfword + Details : +*/ +#define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) { \ + v8i16 s0_m, s1_m; \ + \ + ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m); \ + ILVRL_W2_SH(s1_m, s0_m, out0, out2); \ + out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \ + out3 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out2); \ +} + +/* Description : Transposes 4x8 block with half word elements in vectors + Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, 
in7 + Outputs - out0, out1, out2, out3, out4, out5, out6, out7 + Return Type - signed halfword + Details : +*/ +#define TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3, out4, out5, out6, out7) { \ + v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + v8i16 tmp0_n, tmp1_n, tmp2_n, tmp3_n; \ + v8i16 zero_m = { 0 }; \ + \ + ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6, \ + tmp0_n, tmp1_n, tmp2_n, tmp3_n); \ + ILVRL_W2_SH(tmp1_n, tmp0_n, tmp0_m, tmp2_m); \ + ILVRL_W2_SH(tmp3_n, tmp2_n, tmp1_m, tmp3_m); \ + \ + out0 = (v8i16)__msa_ilvr_d((v2i64)tmp1_m, (v2i64)tmp0_m); \ + out1 = (v8i16)__msa_ilvl_d((v2i64)tmp1_m, (v2i64)tmp0_m); \ + out2 = (v8i16)__msa_ilvr_d((v2i64)tmp3_m, (v2i64)tmp2_m); \ + out3 = (v8i16)__msa_ilvl_d((v2i64)tmp3_m, (v2i64)tmp2_m); \ + \ + out4 = zero_m; \ + out5 = zero_m; \ + out6 = zero_m; \ + out7 = zero_m; \ +} + +/* Description : Transposes 8x4 block with half word elements in vectors + Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 + Outputs - out0, out1, out2, out3, out4, out5, out6, out7 + Return Type - signed halfword + Details : +*/ +#define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) { \ + v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + \ + ILVR_H2_SH(in1, in0, in3, in2, tmp0_m, tmp1_m); \ + ILVL_H2_SH(in1, in0, in3, in2, tmp2_m, tmp3_m); \ + ILVR_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2); \ + ILVL_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3); \ +} + +/* Description : Transposes 8x8 block with half word elements in vectors + Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 + Outputs - out0, out1, out2, out3, out4, out5, out6, out7 + Return Type - signed halfword + Details : +*/ +#define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3, out4, out5, out6, out7) { \ + v8i16 s0_m, s1_m; \ + v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ + \ + ILVR_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \ + ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m); \ + ILVL_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \ + ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m); \ + ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \ + ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m); \ + ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \ + ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m); \ + PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m, \ + tmp3_m, tmp7_m, out0, out2, out4, out6); \ + out1 = (RTYPE)__msa_pckod_d((v2i64)tmp0_m, (v2i64)tmp4_m); \ + out3 = (RTYPE)__msa_pckod_d((v2i64)tmp1_m, (v2i64)tmp5_m); \ + out5 = (RTYPE)__msa_pckod_d((v2i64)tmp2_m, (v2i64)tmp6_m); \ + out7 = (RTYPE)__msa_pckod_d((v2i64)tmp3_m, (v2i64)tmp7_m); \ +} +#define TRANSPOSE8x8_SH_SH(...) 
TRANSPOSE8x8_H(v8i16, __VA_ARGS__) + +/* Description : Transposes 4x4 block with word elements in vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1, out2, out3 + Return Type - signed word + Details : +*/ +#define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3) { \ + v4i32 s0_m, s1_m, s2_m, s3_m; \ + \ + ILVRL_W2_SW(in1, in0, s0_m, s1_m); \ + ILVRL_W2_SW(in3, in2, s2_m, s3_m); \ + \ + out0 = (v4i32)__msa_ilvr_d((v2i64)s2_m, (v2i64)s0_m); \ + out1 = (v4i32)__msa_ilvl_d((v2i64)s2_m, (v2i64)s0_m); \ + out2 = (v4i32)__msa_ilvr_d((v2i64)s3_m, (v2i64)s1_m); \ + out3 = (v4i32)__msa_ilvl_d((v2i64)s3_m, (v2i64)s1_m); \ +} + +/* Description : Add block 4x4 + Arguments : Inputs - in0, in1, in2, in3, pdst, stride + Outputs - + Return Type - unsigned bytes + Details : Least significant 4 bytes from each input vector are added to + the destination bytes, clipped between 0-255 and then stored. +*/ +#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride) { \ + uint32_t src0_m, src1_m, src2_m, src3_m; \ + uint32_t out0_m, out1_m, out2_m, out3_m; \ + v8i16 inp0_m, inp1_m, res0_m, res1_m; \ + v16i8 dst0_m = { 0 }; \ + v16i8 dst1_m = { 0 }; \ + v16i8 zero_m = { 0 }; \ \ - out0_m = __msa_copy_u_d((v2i64)tmp0_m, 0); \ - out1_m = __msa_copy_u_d((v2i64)tmp0_m, 1); \ - out2_m = __msa_copy_u_d((v2i64)tmp1_m, 0); \ - out3_m = __msa_copy_u_d((v2i64)tmp1_m, 1); \ + ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m) \ + LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m); \ + INSERT_W2_SB(src0_m, src1_m, dst0_m); \ + INSERT_W2_SB(src2_m, src3_m, dst1_m); \ + ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m); \ + ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m); \ + CLIP_SH2_0_255(res0_m, res1_m); \ + PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m); \ \ - STORE_DWORD(dst_m, out0_m); \ - dst_m += stride; \ - STORE_DWORD(dst_m, out1_m); \ - dst_m += stride; \ - STORE_DWORD(dst_m, out2_m); \ - dst_m += stride; \ - STORE_DWORD(dst_m, out3_m); \ -} - -#define PCKEV_B_AVG_STORE_VEC(in1, in2, dst, pdest) { \ - v16u8 tmp_m; \ - \ - tmp_m = (v16u8)__msa_pckev_b((v16i8)(in1), (v16i8)(in2)); \ - tmp_m = __msa_aver_u_b(tmp_m, (v16u8)(dst)); \ - STORE_UB(tmp_m, (pdest)); \ -} - -/* Generic for Vector types and GP operations */ -#define BUTTERFLY_4(in0, in1, in2, in3, \ - out0, out1, out2, out3) { \ - out0 = (in0) + (in3); \ - out1 = (in1) + (in2); \ - \ - out2 = (in1) - (in2); \ - out3 = (in0) - (in3); \ -} - -/* Generic for Vector types and GP operations */ -#define BUTTERFLY_8(in0, in1, in2, in3, \ - in4, in5, in6, in7, \ - out0, out1, out2, out3, \ - out4, out5, out6, out7) { \ - out0 = (in0) + (in7); \ - out1 = (in1) + (in6); \ - out2 = (in2) + (in5); \ - out3 = (in3) + (in4); \ - \ - out4 = (in3) - (in4); \ - out5 = (in2) - (in5); \ - out6 = (in1) - (in6); \ - out7 = (in0) - (in7); \ -} -#endif /* HAVE_MSA */ + out0_m = __msa_copy_u_w((v4i32)dst0_m, 0); \ + out1_m = __msa_copy_u_w((v4i32)dst0_m, 1); \ + out2_m = __msa_copy_u_w((v4i32)dst1_m, 0); \ + out3_m = __msa_copy_u_w((v4i32)dst1_m, 1); \ + SW4(out0_m, out1_m, out2_m, out3_m, pdst, stride); \ +} + +/* Description : Pack even elements of input vectors & xor with 128 + Arguments : Inputs - in0, in1 + Outputs - out_m + Return Type - unsigned byte + Details : Signed byte even elements from 'in0' and 'in1' are packed + together in one vector and the resulting vector is xor'ed with + 128 to shift the range from signed to unsigned byte +*/ +#define PCKEV_XORI128_UB(in0, in1) ({ \ + v16u8 out_m; \ + \ + out_m = 
(v16u8)__msa_pckev_b((v16i8)in1, (v16i8)in0); \ + out_m = (v16u8)__msa_xori_b((v16u8)out_m, 128); \ + out_m; \ +}) + +/* Description : Converts inputs to unsigned bytes, interleave, average & store + as 8x4 unsigned byte block + Arguments : Inputs - in0, in1, in2, in3, dst0, dst1, dst2, dst3, + pdst, stride +*/ +#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3, \ + dst0, dst1, dst2, dst3, pdst, stride) { \ + v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + uint8_t *pdst_m = (uint8_t *)(pdst); \ + \ + tmp0_m = PCKEV_XORI128_UB(in0, in1); \ + tmp1_m = PCKEV_XORI128_UB(in2, in3); \ + ILVR_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \ + AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \ + ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \ +} + +/* Description : Pack even byte elements and store byte vector in destination + memory + Arguments : Inputs - in0, in1, pdst +*/ +#define PCKEV_ST_SB(in0, in1, pdst) { \ + v16i8 tmp_m; \ + \ + tmp_m = __msa_pckev_b((v16i8)in1, (v16i8)in0); \ + ST_SB(tmp_m, (pdst)); \ +} + +/* Description : Horizontal 2 tap filter kernel code + Arguments : Inputs - in0, in1, mask, coeff, shift +*/ +#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift) ({ \ + v16i8 tmp0_m; \ + v8u16 tmp1_m; \ + \ + tmp0_m = __msa_vshf_b((v16i8)mask, (v16i8)in1, (v16i8)in0); \ + tmp1_m = __msa_dotp_u_h((v16u8)tmp0_m, (v16u8)coeff); \ + tmp1_m = (v8u16)__msa_srari_h((v8i16)tmp1_m, shift); \ + tmp1_m = __msa_sat_u_h(tmp1_m, shift); \ + \ + tmp1_m; \ +}) #endif /* VP9_COMMON_MIPS_MSA_VP9_MACROS_MSA_H_ */ diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_mfqe_msa.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_mfqe_msa.c new file mode 100644 index 00000000000..64cb9a81835 --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_mfqe_msa.c @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_onyxc_int.h" +#include "vp9/common/mips/msa/vp9_macros_msa.h" + +static void filter_by_weight8x8_msa(const uint8_t *src_ptr, int32_t src_stride, + uint8_t *dst_ptr, int32_t dst_stride, + int32_t src_weight) { + int32_t dst_weight = (1 << MFQE_PRECISION) - src_weight; + int32_t row; + uint64_t src0_d, src1_d, dst0_d, dst1_d; + v16i8 src0 = { 0 }; + v16i8 src1 = { 0 }; + v16i8 dst0 = { 0 }; + v16i8 dst1 = { 0 }; + v8i16 src_wt, dst_wt, res_h_r, res_h_l, src_r, src_l, dst_r, dst_l; + + src_wt = __msa_fill_h(src_weight); + dst_wt = __msa_fill_h(dst_weight); + + for (row = 2; row--;) { + LD2(src_ptr, src_stride, src0_d, src1_d); + src_ptr += (2 * src_stride); + LD2(dst_ptr, dst_stride, dst0_d, dst1_d); + INSERT_D2_SB(src0_d, src1_d, src0); + INSERT_D2_SB(dst0_d, dst1_d, dst0); + + LD2(src_ptr, src_stride, src0_d, src1_d); + src_ptr += (2 * src_stride); + LD2((dst_ptr + 2 * dst_stride), dst_stride, dst0_d, dst1_d); + INSERT_D2_SB(src0_d, src1_d, src1); + INSERT_D2_SB(dst0_d, dst1_d, dst1); + + UNPCK_UB_SH(src0, src_r, src_l); + UNPCK_UB_SH(dst0, dst_r, dst_l); + res_h_r = (src_r * src_wt); + res_h_r += (dst_r * dst_wt); + res_h_l = (src_l * src_wt); + res_h_l += (dst_l * dst_wt); + SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION); + dst0 = (v16i8)__msa_pckev_b((v16i8)res_h_l, (v16i8)res_h_r); + ST8x2_UB(dst0, dst_ptr, dst_stride); + dst_ptr += (2 * dst_stride); + + UNPCK_UB_SH(src1, src_r, src_l); + UNPCK_UB_SH(dst1, dst_r, dst_l); + res_h_r = (src_r * src_wt); + res_h_r += (dst_r * dst_wt); + res_h_l = (src_l * src_wt); + res_h_l += (dst_l * dst_wt); + SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION); + dst1 = (v16i8)__msa_pckev_b((v16i8)res_h_l, (v16i8)res_h_r); + ST8x2_UB(dst1, dst_ptr, dst_stride); + dst_ptr += (2 * dst_stride); + } +} + +static void filter_by_weight16x16_msa(const uint8_t *src_ptr, + int32_t src_stride, + uint8_t *dst_ptr, + int32_t dst_stride, + int32_t src_weight) { + int32_t dst_weight = (1 << MFQE_PRECISION) - src_weight; + int32_t row; + v16i8 src0, src1, src2, src3, dst0, dst1, dst2, dst3; + v8i16 src_wt, dst_wt, res_h_r, res_h_l, src_r, src_l, dst_r, dst_l; + + src_wt = __msa_fill_h(src_weight); + dst_wt = __msa_fill_h(dst_weight); + + for (row = 4; row--;) { + LD_SB4(src_ptr, src_stride, src0, src1, src2, src3); + src_ptr += (4 * src_stride); + LD_SB4(dst_ptr, dst_stride, dst0, dst1, dst2, dst3); + + UNPCK_UB_SH(src0, src_r, src_l); + UNPCK_UB_SH(dst0, dst_r, dst_l); + res_h_r = (src_r * src_wt); + res_h_r += (dst_r * dst_wt); + res_h_l = (src_l * src_wt); + res_h_l += (dst_l * dst_wt); + SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION); + PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr); + dst_ptr += dst_stride; + + UNPCK_UB_SH(src1, src_r, src_l); + UNPCK_UB_SH(dst1, dst_r, dst_l); + res_h_r = (src_r * src_wt); + res_h_r += (dst_r * dst_wt); + res_h_l = (src_l * src_wt); + res_h_l += (dst_l * dst_wt); + SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION); + PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr); + dst_ptr += dst_stride; + + UNPCK_UB_SH(src2, src_r, src_l); + UNPCK_UB_SH(dst2, dst_r, dst_l); + res_h_r = (src_r * src_wt); + res_h_r += (dst_r * dst_wt); + res_h_l = (src_l * src_wt); + res_h_l += (dst_l * dst_wt); + SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION); + PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr); + dst_ptr += dst_stride; + + UNPCK_UB_SH(src3, src_r, src_l); + UNPCK_UB_SH(dst3, dst_r, dst_l); + res_h_r = (src_r * src_wt); + res_h_r += (dst_r * dst_wt); + res_h_l = (src_l * src_wt); + res_h_l += (dst_l * dst_wt); + 
SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION); + PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr); + dst_ptr += dst_stride; + } +} + +void vp9_filter_by_weight8x8_msa(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + int src_weight) { + filter_by_weight8x8_msa(src, src_stride, dst, dst_stride, src_weight); +} + +void vp9_filter_by_weight16x16_msa(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + int src_weight) { + filter_by_weight16x16_msa(src, src_stride, dst, dst_stride, src_weight); +} diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_alloccommon.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_alloccommon.c index 222b88eff5d..8eda491de93 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_alloccommon.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_alloccommon.c @@ -11,6 +11,7 @@ #include "./vpx_config.h" #include "vpx_mem/vpx_mem.h" +#include "vp9/common/vp9_alloccommon.h" #include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_entropymode.h" #include "vp9/common/vp9_entropymv.h" diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_blockd.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_blockd.h index 018a9c2b975..64d379cab34 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_blockd.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_blockd.h @@ -18,74 +18,28 @@ #include "vpx_scale/yv12config.h" #include "vp9/common/vp9_common_data.h" -#include "vp9/common/vp9_filter.h" +#include "vp9/common/vp9_entropy.h" +#include "vp9/common/vp9_entropymode.h" #include "vp9/common/vp9_mv.h" #include "vp9/common/vp9_scale.h" +#include "vp9/common/vp9_seg_common.h" #ifdef __cplusplus extern "C" { #endif -#define BLOCK_SIZE_GROUPS 4 -#define SKIP_CONTEXTS 3 -#define INTER_MODE_CONTEXTS 7 - -/* Segment Feature Masks */ -#define MAX_MV_REF_CANDIDATES 2 - -#define INTRA_INTER_CONTEXTS 4 -#define COMP_INTER_CONTEXTS 5 -#define REF_CONTEXTS 5 - -typedef enum { - PLANE_TYPE_Y = 0, - PLANE_TYPE_UV = 1, - PLANE_TYPES -} PLANE_TYPE; - #define MAX_MB_PLANE 3 -typedef char ENTROPY_CONTEXT; - -static INLINE int combine_entropy_contexts(ENTROPY_CONTEXT a, - ENTROPY_CONTEXT b) { - return (a != 0) + (b != 0); -} - typedef enum { KEY_FRAME = 0, INTER_FRAME = 1, FRAME_TYPES, } FRAME_TYPE; -typedef enum { - DC_PRED, // Average of above and left pixels - V_PRED, // Vertical - H_PRED, // Horizontal - D45_PRED, // Directional 45 deg = round(arctan(1/1) * 180/pi) - D135_PRED, // Directional 135 deg = 180 - 45 - D117_PRED, // Directional 117 deg = 180 - 63 - D153_PRED, // Directional 153 deg = 180 - 27 - D207_PRED, // Directional 207 deg = 180 + 27 - D63_PRED, // Directional 63 deg = round(arctan(2/1) * 180/pi) - TM_PRED, // True-motion - NEARESTMV, - NEARMV, - ZEROMV, - NEWMV, - MB_MODE_COUNT -} PREDICTION_MODE; - static INLINE int is_inter_mode(PREDICTION_MODE mode) { return mode >= NEARESTMV && mode <= NEWMV; } -#define INTRA_MODES (TM_PRED + 1) - -#define INTER_MODES (1 + NEWMV - NEARESTMV) - -#define INTER_OFFSET(mode) ((mode) - NEARESTMV) - /* For keyframes, intra block modes are predicted by the (already decoded) modes for the Y blocks to the left and above us; for interframes, there is a single probability table. 
*/ @@ -96,16 +50,16 @@ typedef struct { } b_mode_info; // Note that the rate-distortion optimization loop, bit-stream writer, and -// decoder implementation modules critically rely on the enum entry values +// decoder implementation modules critically rely on the defined entry values // specified herein. They should be refactored concurrently. -typedef enum { - NONE = -1, - INTRA_FRAME = 0, - LAST_FRAME = 1, - GOLDEN_FRAME = 2, - ALTREF_FRAME = 3, - MAX_REF_FRAMES = 4 -} MV_REFERENCE_FRAME; + +#define NONE -1 +#define INTRA_FRAME 0 +#define LAST_FRAME 1 +#define GOLDEN_FRAME 2 +#define ALTREF_FRAME 3 +#define MAX_REF_FRAMES 4 +typedef int8_t MV_REFERENCE_FRAME; // This structure now relates to 8x8 block regions. typedef struct { @@ -121,12 +75,17 @@ typedef struct { PREDICTION_MODE uv_mode; // Only for INTER blocks + INTERP_FILTER interp_filter; MV_REFERENCE_FRAME ref_frame[2]; + + // TODO(slavarnway): Delete and use bmi[3].as_mv[] instead. int_mv mv[2]; + +#if CONFIG_VP9_ENCODER + // TODO(slavarnway): Move to encoder int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES]; uint8_t mode_context[MAX_REF_FRAMES]; - INTERP_FILTER interp_filter; - +#endif } MB_MODE_INFO; typedef struct MODE_INFO { @@ -170,9 +129,12 @@ struct macroblockd_plane { int subsampling_y; struct buf_2d dst; struct buf_2d pre[2]; - const int16_t *dequant; ENTROPY_CONTEXT *above_context; ENTROPY_CONTEXT *left_context; + int16_t seg_dequant[MAX_SEGMENTS][2]; + + // encoder + const int16_t *dequant; }; #define BLOCK_OFFSET(x, i) ((x) + (i) * 16) @@ -187,7 +149,7 @@ typedef struct RefBuffer { typedef struct macroblockd { struct macroblockd_plane plane[MAX_MB_PLANE]; - + FRAME_COUNTS *counts; int mi_stride; MODE_INFO **mi; @@ -199,12 +161,17 @@ typedef struct macroblockd { int up_available; int left_available; + const vp9_prob (*partition_probs)[PARTITION_TYPES - 1]; + /* Distance of MB away from frame edges */ int mb_to_left_edge; int mb_to_right_edge; int mb_to_top_edge; int mb_to_bottom_edge; + FRAME_CONTEXT *fc; + int frame_parallel_decoding_mode; + /* pointers to reference frames */ RefBuffer *block_refs[2]; @@ -217,13 +184,9 @@ typedef struct macroblockd { PARTITION_CONTEXT *above_seg_context; PARTITION_CONTEXT left_seg_context[8]; - /* mc buffer */ - DECLARE_ALIGNED(16, uint8_t, mc_buf[80 * 2 * 80 * 2]); - #if CONFIG_VP9_HIGHBITDEPTH /* Bit depth: 8, 10, 12 */ int bd; - DECLARE_ALIGNED(16, uint16_t, mc_buf_high[80 * 2 * 80 * 2]); #endif /* dqcoeff are shared by all the planes. 
So planes must be decoded serially */ @@ -285,6 +248,27 @@ static INLINE BLOCK_SIZE get_plane_block_size(BLOCK_SIZE bsize, return ss_size_lookup[bsize][pd->subsampling_x][pd->subsampling_y]; } +static INLINE void reset_skip_context(MACROBLOCKD *xd, BLOCK_SIZE bsize) { + int i; + for (i = 0; i < MAX_MB_PLANE; i++) { + struct macroblockd_plane *const pd = &xd->plane[i]; + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); + memset(pd->above_context, 0, + sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide_lookup[plane_bsize]); + memset(pd->left_context, 0, + sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high_lookup[plane_bsize]); + } +} + +static INLINE const vp9_prob *get_y_mode_probs(const MODE_INFO *mi, + const MODE_INFO *above_mi, + const MODE_INFO *left_mi, + int block) { + const PREDICTION_MODE above = vp9_above_block_mode(mi, above_mi, block); + const PREDICTION_MODE left = vp9_left_block_mode(mi, left_mi, block); + return vp9_kf_y_mode_prob[above][left]; +} + typedef void (*foreach_transformed_block_visitor)(int plane, int block, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_common.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_common.h index d06b8e0405e..9c2d7791e75 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_common.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_common.h @@ -27,12 +27,6 @@ extern "C" { #define MIN(x, y) (((x) < (y)) ? (x) : (y)) #define MAX(x, y) (((x) > (y)) ? (x) : (y)) -#define ROUND_POWER_OF_TWO(value, n) \ - (((value) + (1 << ((n) - 1))) >> (n)) - -#define ALIGN_POWER_OF_TWO(value, n) \ - (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1)) - // Only need this for fixed-size arrays, for structs just assign. #define vp9_copy(dest, src) { \ assert(sizeof(dest) == sizeof(src)); \ @@ -83,9 +77,6 @@ static INLINE uint16_t clip_pixel_highbd(int val, int bd) { typedef int64_t tran_high_t; typedef int32_t tran_low_t; -#define CONVERT_TO_SHORTPTR(x) ((uint16_t*)(((uintptr_t)x) << 1)) -#define CONVERT_TO_BYTEPTR(x) ((uint8_t*)(((uintptr_t)x) >> 1 )) - #else // Note: diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropy.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropy.c index a2584e8da5b..ad6c04bcc46 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropy.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropy.c @@ -133,12 +133,6 @@ const uint8_t vp9_pt_energy_class[ENTROPY_TOKENS] = { 0, 1, 2, 3, 3, 4, 4, 5, 5, 5, 5, 5 }; -const vp9_tree_index vp9_coefmodel_tree[TREE_SIZE(UNCONSTRAINED_NODES + 1)] = { - -EOB_MODEL_TOKEN, 2, - -ZERO_TOKEN, 4, - -ONE_TOKEN, -TWO_TOKEN, -}; - // Model obtained from a 2-sided zero-centerd distribuition derived // from a Pareto distribution. 
The cdf of the distribution is: // cdf(x) = 0.5 + 0.5 * sgn(x) * [1 - {alpha/(alpha + |x|)} ^ beta] diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropy.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropy.h index 5a9007b5417..2fc97c3f9ed 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropy.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropy.h @@ -14,8 +14,8 @@ #include "vpx/vpx_integer.h" #include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_enums.h" #include "vp9/common/vp9_prob.h" -#include "vp9/common/vp9_scan.h" #ifdef __cplusplus extern "C" { @@ -74,7 +74,6 @@ DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat6_prob_high12[18]); #endif // CONFIG_VP9_HIGHBITDEPTH #define EOB_MODEL_TOKEN 3 -extern const vp9_tree_index vp9_coefmodel_tree[]; typedef struct { const vp9_tree_index *tree; @@ -137,18 +136,6 @@ struct VP9Common; void vp9_default_coef_probs(struct VP9Common *cm); void vp9_adapt_coef_probs(struct VP9Common *cm); -static INLINE void reset_skip_context(MACROBLOCKD *xd, BLOCK_SIZE bsize) { - int i; - for (i = 0; i < MAX_MB_PLANE; i++) { - struct macroblockd_plane *const pd = &xd->plane[i]; - const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); - memset(pd->above_context, 0, - sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide_lookup[plane_bsize]); - memset(pd->left_context, 0, - sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high_lookup[plane_bsize]); - } -} - // This is the index in the scan order beyond which all coefficients for // 8x8 transform and above are in the top band. // This macro is currently unused but may be used by certain implementations @@ -185,6 +172,13 @@ typedef unsigned int vp9_coeff_count_model[REF_TYPES][COEF_BANDS] void vp9_model_to_full_probs(const vp9_prob *model, vp9_prob *full); +typedef char ENTROPY_CONTEXT; + +static INLINE int combine_entropy_contexts(ENTROPY_CONTEXT a, + ENTROPY_CONTEXT b) { + return (a != 0) + (b != 0); +} + static INLINE int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a, const ENTROPY_CONTEXT *l) { ENTROPY_CONTEXT above_ec = 0, left_ec = 0; @@ -214,18 +208,6 @@ static INLINE int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a, return combine_entropy_contexts(above_ec, left_ec); } -static INLINE const scan_order *get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size, - PLANE_TYPE type, int block_idx) { - const MODE_INFO *const mi = xd->mi[0]; - - if (is_inter_block(&mi->mbmi) || type != PLANE_TYPE_Y || xd->lossless) { - return &vp9_default_scan_orders[tx_size]; - } else { - const PREDICTION_MODE mode = get_y_mode(mi, block_idx); - return &vp9_scan_orders[tx_size][intra_mode_to_tx_type_lookup[mode]]; - } -} - #ifdef __cplusplus } // extern "C" #endif diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropymode.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropymode.c index 424451fee39..22d431bb22d 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropymode.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropymode.c @@ -314,7 +314,7 @@ static const vp9_prob default_switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS] { 149, 144, }, }; -void vp9_init_mode_probs(FRAME_CONTEXT *fc) { +static void init_mode_probs(FRAME_CONTEXT *fc) { vp9_copy(fc->uv_mode_prob, default_if_uv_probs); vp9_copy(fc->y_mode_prob, default_if_y_probs); vp9_copy(fc->switchable_interp_prob, default_switchable_interp_prob); @@ -444,7 +444,7 @@ void 
vp9_setup_past_independence(VP9_COMMON *cm) { lf->last_sharpness_level = -1; vp9_default_coef_probs(cm); - vp9_init_mode_probs(cm->fc); + init_mode_probs(cm->fc); vp9_init_mv_probs(cm); cm->fc->initialized = 1; diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropymode.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropymode.h index f4e20e1af8b..8c9e6a7319d 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropymode.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropymode.h @@ -11,7 +11,7 @@ #ifndef VP9_COMMON_VP9_ENTROPYMODE_H_ #define VP9_COMMON_VP9_ENTROPYMODE_H_ -#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_filter.h" #include "vp9/common/vp9_entropy.h" #include "vp9/common/vp9_entropymv.h" @@ -19,8 +19,12 @@ extern "C" { #endif +#define BLOCK_SIZE_GROUPS 4 + #define TX_SIZE_CONTEXTS 2 +#define INTER_OFFSET(mode) ((mode) - NEARESTMV) + struct VP9Common; struct tx_probs { @@ -86,8 +90,6 @@ extern const vp9_tree_index vp9_switchable_interp_tree void vp9_setup_past_independence(struct VP9Common *cm); -void vp9_init_mode_probs(FRAME_CONTEXT *fc); - void vp9_adapt_mode_probs(struct VP9Common *cm); void tx_counts_to_branch_counts_32x32(const unsigned int *tx_count_32x32p, @@ -97,15 +99,6 @@ void tx_counts_to_branch_counts_16x16(const unsigned int *tx_count_16x16p, void tx_counts_to_branch_counts_8x8(const unsigned int *tx_count_8x8p, unsigned int (*ct_8x8p)[2]); -static INLINE const vp9_prob *get_y_mode_probs(const MODE_INFO *mi, - const MODE_INFO *above_mi, - const MODE_INFO *left_mi, - int block) { - const PREDICTION_MODE above = vp9_above_block_mode(mi, above_mi, block); - const PREDICTION_MODE left = vp9_left_block_mode(mi, left_mi, block); - return vp9_kf_y_mode_prob[above][left]; -} - #ifdef __cplusplus } // extern "C" #endif diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_enums.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_enums.h index 7938fc10a11..d089f23f970 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_enums.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_enums.h @@ -12,6 +12,7 @@ #define VP9_COMMON_VP9_ENUMS_H_ #include "./vpx_config.h" +#include "vpx/vpx_integer.h" #ifdef __cplusplus extern "C" { @@ -40,23 +41,22 @@ typedef enum BITSTREAM_PROFILE { MAX_PROFILES } BITSTREAM_PROFILE; -typedef enum BLOCK_SIZE { - BLOCK_4X4, - BLOCK_4X8, - BLOCK_8X4, - BLOCK_8X8, - BLOCK_8X16, - BLOCK_16X8, - BLOCK_16X16, - BLOCK_16X32, - BLOCK_32X16, - BLOCK_32X32, - BLOCK_32X64, - BLOCK_64X32, - BLOCK_64X64, - BLOCK_SIZES, - BLOCK_INVALID = BLOCK_SIZES -} BLOCK_SIZE; +#define BLOCK_4X4 0 +#define BLOCK_4X8 1 +#define BLOCK_8X4 2 +#define BLOCK_8X8 3 +#define BLOCK_8X16 4 +#define BLOCK_16X8 5 +#define BLOCK_16X16 6 +#define BLOCK_16X32 7 +#define BLOCK_32X16 8 +#define BLOCK_32X32 9 +#define BLOCK_32X64 10 +#define BLOCK_64X32 11 +#define BLOCK_64X64 12 +#define BLOCK_SIZES 13 +#define BLOCK_INVALID BLOCK_SIZES +typedef uint8_t BLOCK_SIZE; typedef enum PARTITION_TYPE { PARTITION_NONE, @@ -72,13 +72,12 @@ typedef char PARTITION_CONTEXT; #define PARTITION_CONTEXTS (4 * PARTITION_PLOFFSET) // block transform size -typedef enum { - TX_4X4 = 0, // 4x4 transform - TX_8X8 = 1, // 8x8 transform - TX_16X16 = 2, // 16x16 transform - TX_32X32 = 3, // 32x32 transform - TX_SIZES -} TX_SIZE; +typedef uint8_t TX_SIZE; +#define TX_4X4 ((TX_SIZE)0) // 4x4 transform +#define TX_8X8 ((TX_SIZE)1) // 8x8 transform +#define TX_16X16 ((TX_SIZE)2) // 
16x16 transform +#define TX_32X32 ((TX_SIZE)3) // 32x32 transform +#define TX_SIZES ((TX_SIZE)4) // frame transform mode typedef enum { @@ -104,6 +103,43 @@ typedef enum { VP9_ALT_FLAG = 1 << 2, } VP9_REFFRAME; +typedef enum { + PLANE_TYPE_Y = 0, + PLANE_TYPE_UV = 1, + PLANE_TYPES +} PLANE_TYPE; + +#define DC_PRED 0 // Average of above and left pixels +#define V_PRED 1 // Vertical +#define H_PRED 2 // Horizontal +#define D45_PRED 3 // Directional 45 deg = round(arctan(1/1) * 180/pi) +#define D135_PRED 4 // Directional 135 deg = 180 - 45 +#define D117_PRED 5 // Directional 117 deg = 180 - 63 +#define D153_PRED 6 // Directional 153 deg = 180 - 27 +#define D207_PRED 7 // Directional 207 deg = 180 + 27 +#define D63_PRED 8 // Directional 63 deg = round(arctan(2/1) * 180/pi) +#define TM_PRED 9 // True-motion +#define NEARESTMV 10 +#define NEARMV 11 +#define ZEROMV 12 +#define NEWMV 13 +#define MB_MODE_COUNT 14 +typedef uint8_t PREDICTION_MODE; + +#define INTRA_MODES (TM_PRED + 1) + +#define INTER_MODES (1 + NEWMV - NEARESTMV) + +#define SKIP_CONTEXTS 3 +#define INTER_MODE_CONTEXTS 7 + +/* Segment Feature Masks */ +#define MAX_MV_REF_CANDIDATES 2 + +#define INTRA_INTER_CONTEXTS 4 +#define COMP_INTER_CONTEXTS 5 +#define REF_CONTEXTS 5 + #ifdef __cplusplus } // extern "C" #endif diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_filter.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_filter.c index afcdf22ec63..14654a5ab2a 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_filter.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_filter.c @@ -12,7 +12,8 @@ #include "vp9/common/vp9_filter.h" -const InterpKernel vp9_bilinear_filters[SUBPEL_SHIFTS] = { +DECLARE_ALIGNED(256, static const InterpKernel, + bilinear_filters[SUBPEL_SHIFTS]) = { { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, 0, 120, 8, 0, 0, 0 }, { 0, 0, 0, 112, 16, 0, 0, 0 }, @@ -32,8 +33,8 @@ const InterpKernel vp9_bilinear_filters[SUBPEL_SHIFTS] = { }; // Lagrangian interpolation filter -DECLARE_ALIGNED(256, const InterpKernel, - vp9_sub_pel_filters_8[SUBPEL_SHIFTS]) = { +DECLARE_ALIGNED(256, static const InterpKernel, + sub_pel_filters_8[SUBPEL_SHIFTS]) = { { 0, 0, 0, 128, 0, 0, 0, 0}, { 0, 1, -5, 126, 8, -3, 1, 0}, { -1, 3, -10, 122, 18, -6, 2, 0}, @@ -53,8 +54,8 @@ DECLARE_ALIGNED(256, const InterpKernel, }; // DCT based filter -DECLARE_ALIGNED(256, const InterpKernel, - vp9_sub_pel_filters_8s[SUBPEL_SHIFTS]) = { +DECLARE_ALIGNED(256, static const InterpKernel, + sub_pel_filters_8s[SUBPEL_SHIFTS]) = { {0, 0, 0, 128, 0, 0, 0, 0}, {-1, 3, -7, 127, 8, -3, 1, 0}, {-2, 5, -13, 125, 17, -6, 3, -1}, @@ -74,8 +75,8 @@ DECLARE_ALIGNED(256, const InterpKernel, }; // freqmultiplier = 0.5 -DECLARE_ALIGNED(256, const InterpKernel, - vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS]) = { +DECLARE_ALIGNED(256, static const InterpKernel, + sub_pel_filters_8lp[SUBPEL_SHIFTS]) = { { 0, 0, 0, 128, 0, 0, 0, 0}, {-3, -1, 32, 64, 38, 1, -3, 0}, {-2, -2, 29, 63, 41, 2, -3, 0}, @@ -95,15 +96,15 @@ DECLARE_ALIGNED(256, const InterpKernel, }; -static const InterpKernel* vp9_filter_kernels[4] = { - vp9_sub_pel_filters_8, - vp9_sub_pel_filters_8lp, - vp9_sub_pel_filters_8s, - vp9_bilinear_filters +static const InterpKernel* filter_kernels[4] = { + sub_pel_filters_8, + sub_pel_filters_8lp, + sub_pel_filters_8s, + bilinear_filters }; const InterpKernel *vp9_get_interp_kernel(INTERP_FILTER filter) { assert(filter != SWITCHABLE); - return vp9_filter_kernels[filter]; + return filter_kernels[filter]; } diff --git 
a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_filter.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_filter.h index d963ee23569..13d38affbaa 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_filter.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_filter.h @@ -27,30 +27,21 @@ extern "C" { #define SUBPEL_SHIFTS (1 << SUBPEL_BITS) #define SUBPEL_TAPS 8 -typedef enum { - EIGHTTAP = 0, - EIGHTTAP_SMOOTH = 1, - EIGHTTAP_SHARP = 2, - SWITCHABLE_FILTERS = 3, /* Number of switchable filters */ - BILINEAR = 3, - // The codec can operate in four possible inter prediction filter mode: - // 8-tap, 8-tap-smooth, 8-tap-sharp, and switching between the three. - SWITCHABLE_FILTER_CONTEXTS = SWITCHABLE_FILTERS + 1, - SWITCHABLE = 4 /* should be the last one */ -} INTERP_FILTER; +#define EIGHTTAP 0 +#define EIGHTTAP_SMOOTH 1 +#define EIGHTTAP_SHARP 2 +#define SWITCHABLE_FILTERS 3 /* Number of switchable filters */ +#define BILINEAR 3 +// The codec can operate in four possible inter prediction filter mode: +// 8-tap, 8-tap-smooth, 8-tap-sharp, and switching between the three. +#define SWITCHABLE_FILTER_CONTEXTS (SWITCHABLE_FILTERS + 1) +#define SWITCHABLE 4 /* should be the last one */ +typedef uint8_t INTERP_FILTER; typedef int16_t InterpKernel[SUBPEL_TAPS]; const InterpKernel *vp9_get_interp_kernel(INTERP_FILTER filter); -DECLARE_ALIGNED(256, extern const InterpKernel, - vp9_bilinear_filters[SUBPEL_SHIFTS]); - -// The VP9_BILINEAR_FILTERS_2TAP macro returns a pointer to the bilinear -// filter kernel as a 2 tap filter. -#define BILINEAR_FILTERS_2TAP(x) \ - (vp9_bilinear_filters[(x)] + SUBPEL_TAPS/2 - 1) - #ifdef __cplusplus } // extern "C" #endif diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_idct.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_idct.c index 3b214371c47..174b96e21ad 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_idct.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_idct.c @@ -11,6 +11,7 @@ #include <math.h> #include "./vp9_rtcd.h" +#include "vpx_ports/mem.h" #include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_idct.h" #include "vp9/common/vp9_systemdependent.h" diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_idct.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_idct.h index 6e2551dd4bc..cee1682a67f 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_idct.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_idct.h @@ -14,6 +14,7 @@ #include <assert.h> #include "./vpx_config.h" +#include "vpx_ports/mem.h" #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_enums.h" diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_loopfilter.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_loopfilter.c index 69d393ef469..9816728364a 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_loopfilter.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_loopfilter.c @@ -13,6 +13,7 @@ #include "vp9/common/vp9_onyxc_int.h" #include "vp9/common/vp9_reconinter.h" #include "vpx_mem/vpx_mem.h" +#include "vpx_ports/mem.h" #include "vp9/common/vp9_seg_common.h" @@ -266,8 +267,8 @@ void vp9_loop_filter_frame_init(VP9_COMMON *cm, int default_filt_lvl) { for (seg_id = 0; seg_id < MAX_SEGMENTS; seg_id++) { int lvl_seg = default_filt_lvl; - if (vp9_segfeature_active(seg, seg_id, SEG_LVL_ALT_LF)) { - const int data = vp9_get_segdata(seg, seg_id, 
SEG_LVL_ALT_LF); + if (segfeature_active(seg, seg_id, SEG_LVL_ALT_LF)) { + const int data = get_segdata(seg, seg_id, SEG_LVL_ALT_LF); lvl_seg = clamp(seg->abs_delta == SEGMENT_ABSDATA ? data : default_filt_lvl + data, 0, MAX_LOOP_FILTER); diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_loopfilter_filters.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_loopfilter_filters.c index 2e32c40b85e..3cf4c32253e 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_loopfilter_filters.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_loopfilter_filters.c @@ -9,6 +9,7 @@ */ #include "./vpx_config.h" +#include "vpx_ports/mem.h" #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_loopfilter.h" #include "vp9/common/vp9_onyxc_int.h" diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_mfqe.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_mfqe.c index 57189df16ec..bebb37eda07 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_mfqe.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_mfqe.c @@ -171,13 +171,13 @@ static void mfqe_block(BLOCK_SIZE bs, const uint8_t *y, const uint8_t *u, get_thr(bs, qdiff, &sad_thr, &vdiff_thr); if (bs == BLOCK_16X16) { - vdiff = (vp9_variance16x16(y, y_stride, yd, yd_stride, &sse) + 128) >> 8; + vdiff = (vpx_variance16x16(y, y_stride, yd, yd_stride, &sse) + 128) >> 8; sad = (vpx_sad16x16(y, y_stride, yd, yd_stride) + 128) >> 8; } else if (bs == BLOCK_32X32) { - vdiff = (vp9_variance32x32(y, y_stride, yd, yd_stride, &sse) + 512) >> 10; + vdiff = (vpx_variance32x32(y, y_stride, yd, yd_stride, &sse) + 512) >> 10; sad = (vpx_sad32x32(y, y_stride, yd, yd_stride) + 512) >> 10; } else /* if (bs == BLOCK_64X64) */ { - vdiff = (vp9_variance64x64(y, y_stride, yd, yd_stride, &sse) + 2048) >> 12; + vdiff = (vpx_variance64x64(y, y_stride, yd, yd_stride, &sse) + 2048) >> 12; sad = (vpx_sad64x64(y, y_stride, yd, yd_stride) + 2048) >> 12; } diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_mvref_common.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_mvref_common.c index 51e147e0056..5f8ee0fcc50 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_mvref_common.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_mvref_common.c @@ -18,7 +18,8 @@ static void find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd, MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame, int_mv *mv_ref_list, int block, int mi_row, int mi_col, - find_mv_refs_sync sync, void *const data) { + find_mv_refs_sync sync, void *const data, + uint8_t *mode_context) { const int *ref_sign_bias = cm->ref_frame_sign_bias; int i, refmv_count = 0; const POSITION *const mv_ref_search = mv_ref_blocks[mi->mbmi.sb_type]; @@ -138,7 +139,7 @@ static void find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd, Done: - mi->mbmi.mode_context[ref_frame] = counter_to_context[context_counter]; + mode_context[ref_frame] = counter_to_context[context_counter]; // Clamp vectors for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) @@ -150,9 +151,10 @@ void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd, MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame, int_mv *mv_ref_list, int mi_row, int mi_col, - find_mv_refs_sync sync, void *const data) { + find_mv_refs_sync sync, void *const data, + uint8_t *mode_context) { find_mv_refs_idx(cm, xd, tile, mi, ref_frame, mv_ref_list, -1, - mi_row, mi_col, sync, data); + mi_row, mi_col, sync, data, mode_context); } 
static void lower_mv_precision(MV *mv, int allow_hp) { @@ -181,7 +183,8 @@ void vp9_find_best_ref_mvs(MACROBLOCKD *xd, int allow_hp, void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd, const TileInfo *const tile, int block, int ref, int mi_row, int mi_col, - int_mv *nearest_mv, int_mv *near_mv) { + int_mv *nearest_mv, int_mv *near_mv, + uint8_t *mode_context) { int_mv mv_list[MAX_MV_REF_CANDIDATES]; MODE_INFO *const mi = xd->mi[0]; b_mode_info *bmi = mi->bmi; @@ -190,7 +193,7 @@ void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd, assert(MAX_MV_REF_CANDIDATES == 2); find_mv_refs_idx(cm, xd, tile, mi, mi->mbmi.ref_frame[ref], mv_list, block, - mi_row, mi_col, NULL, NULL); + mi_row, mi_col, NULL, NULL, mode_context); near_mv->as_int = 0; switch (block) { @@ -223,6 +226,6 @@ void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd, break; } default: - assert("Invalid block index."); + assert(0 && "Invalid block index."); } } diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_mvref_common.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_mvref_common.h index f1df521468f..621dc14be65 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_mvref_common.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_mvref_common.h @@ -212,7 +212,8 @@ void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd, const TileInfo *const tile, MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame, int_mv *mv_ref_list, int mi_row, int mi_col, - find_mv_refs_sync sync, void *const data); + find_mv_refs_sync sync, void *const data, + uint8_t *mode_context); // check a list of motion vectors by sad score using a number rows of pixels // above and a number cols of pixels in the left to select the one with best @@ -223,7 +224,8 @@ void vp9_find_best_ref_mvs(MACROBLOCKD *xd, int allow_hp, void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd, const TileInfo *const tile, int block, int ref, int mi_row, int mi_col, - int_mv *nearest_mv, int_mv *near_mv); + int_mv *nearest_mv, int_mv *near_mv, + uint8_t *mode_context); #ifdef __cplusplus } // extern "C" diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_onyxc_int.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_onyxc_int.h index f710f810618..1811d76dfd3 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_onyxc_int.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_onyxc_int.h @@ -14,6 +14,7 @@ #include "./vpx_config.h" #include "vpx/internal/vpx_codec_internal.h" #include "./vp9_rtcd.h" +#include "vp9/common/vp9_alloccommon.h" #include "vp9/common/vp9_loopfilter.h" #include "vp9/common/vp9_entropymv.h" #include "vp9/common/vp9_entropy.h" @@ -161,7 +162,8 @@ typedef struct VP9Common { int show_existing_frame; // Flag signaling that the frame is encoded using only INTRA modes. - int intra_only; + uint8_t intra_only; + uint8_t last_intra_only; int allow_high_precision_mv; @@ -263,6 +265,7 @@ typedef struct VP9Common { int log2_tile_cols, log2_tile_rows; int byte_alignment; + int skip_loop_filter; // Private data associated with the frame buffer callbacks. void *cb_priv; @@ -307,8 +310,13 @@ static INLINE int get_free_fb(VP9_COMMON *cm) { if (frame_bufs[i].ref_count == 0) break; - assert(i < FRAME_BUFFERS); - frame_bufs[i].ref_count = 1; + if (i != FRAME_BUFFERS) { + frame_bufs[i].ref_count = 1; + } else { + // Reset i to be INVALID_IDX to indicate no free buffer found. 
+ i = INVALID_IDX; + } + unlock_buffer_pool(cm->buffer_pool); return i; } @@ -328,6 +336,18 @@ static INLINE int mi_cols_aligned_to_sb(int n_mis) { return ALIGN_POWER_OF_TWO(n_mis, MI_BLOCK_SIZE_LOG2); } +static INLINE int frame_is_intra_only(const VP9_COMMON *const cm) { + return cm->frame_type == KEY_FRAME || cm->intra_only; +} + +static INLINE void set_partition_probs(const VP9_COMMON *const cm, + MACROBLOCKD *const xd) { + xd->partition_probs = + frame_is_intra_only(cm) ? + &vp9_kf_partition_probs[0] : + (const vp9_prob (*)[PARTITION_TYPES - 1])cm->fc->partition_prob; +} + static INLINE void init_macroblockd(VP9_COMMON *cm, MACROBLOCKD *xd) { int i; @@ -335,21 +355,26 @@ static INLINE void init_macroblockd(VP9_COMMON *cm, MACROBLOCKD *xd) { xd->plane[i].dqcoeff = xd->dqcoeff; xd->above_context[i] = cm->above_context + i * sizeof(*cm->above_context) * 2 * mi_cols_aligned_to_sb(cm->mi_cols); + + if (xd->plane[i].plane_type == PLANE_TYPE_Y) { + memcpy(xd->plane[i].seg_dequant, cm->y_dequant, sizeof(cm->y_dequant)); + } else { + memcpy(xd->plane[i].seg_dequant, cm->uv_dequant, sizeof(cm->uv_dequant)); + } + xd->fc = cm->fc; + xd->frame_parallel_decoding_mode = cm->frame_parallel_decoding_mode; } xd->above_seg_context = cm->above_seg_context; xd->mi_stride = cm->mi_stride; xd->error_info = &cm->error; -} -static INLINE int frame_is_intra_only(const VP9_COMMON *const cm) { - return cm->frame_type == KEY_FRAME || cm->intra_only; + set_partition_probs(cm, xd); } -static INLINE const vp9_prob* get_partition_probs(const VP9_COMMON *cm, +static INLINE const vp9_prob* get_partition_probs(const MACROBLOCKD *xd, int ctx) { - return frame_is_intra_only(cm) ? vp9_kf_partition_probs[ctx] - : cm->fc->partition_prob[ctx]; + return xd->partition_probs[ctx]; } static INLINE void set_skip_context(MACROBLOCKD *xd, int mi_row, int mi_col) { diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_postproc.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_postproc.c index 983a4744dd6..d26a6eb5c88 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_postproc.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_postproc.c @@ -16,12 +16,10 @@ #include "./vpx_scale_rtcd.h" #include "./vp9_rtcd.h" +#include "vpx_ports/mem.h" #include "vpx_scale/vpx_scale.h" #include "vpx_scale/yv12config.h" -#if CONFIG_VP9_HIGHBITDEPTH -#include "vp9/common/vp9_common.h" -#endif #include "vp9/common/vp9_onyxc_int.h" #include "vp9/common/vp9_postproc.h" #include "vp9/common/vp9_systemdependent.h" diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_quant_common.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_quant_common.c index 564a3eb0ce3..d83f3c1a2f5 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_quant_common.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_quant_common.c @@ -266,8 +266,8 @@ int16_t vp9_ac_quant(int qindex, int delta, vpx_bit_depth_t bit_depth) { int vp9_get_qindex(const struct segmentation *seg, int segment_id, int base_qindex) { - if (vp9_segfeature_active(seg, segment_id, SEG_LVL_ALT_Q)) { - const int data = vp9_get_segdata(seg, segment_id, SEG_LVL_ALT_Q); + if (segfeature_active(seg, segment_id, SEG_LVL_ALT_Q)) { + const int data = get_segdata(seg, segment_id, SEG_LVL_ALT_Q); const int seg_qindex = seg->abs_delta == SEGMENT_ABSDATA ? 
data : base_qindex + data; return clamp(seg_qindex, 0, MAXQ); diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconintra.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconintra.c index 825d03d69b2..1e9acb8d5d3 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconintra.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconintra.c @@ -12,6 +12,7 @@ #include "./vp9_rtcd.h" #include "vpx_mem/vpx_mem.h" +#include "vpx_ports/mem.h" #include "vpx_ports/vpx_once.h" #include "vp9/common/vp9_reconintra.h" @@ -78,6 +79,15 @@ static const uint8_t extend_modes[INTRA_MODES] = { intra_pred_highbd_sized(type, 16) \ intra_pred_highbd_sized(type, 32) +#define intra_pred_no_4x4(type) \ + intra_pred_sized(type, 8) \ + intra_pred_sized(type, 16) \ + intra_pred_sized(type, 32) \ + intra_pred_highbd_sized(type, 4) \ + intra_pred_highbd_sized(type, 8) \ + intra_pred_highbd_sized(type, 16) \ + intra_pred_highbd_sized(type, 32) + #else #define intra_pred_allsizes(type) \ @@ -85,8 +95,17 @@ static const uint8_t extend_modes[INTRA_MODES] = { intra_pred_sized(type, 8) \ intra_pred_sized(type, 16) \ intra_pred_sized(type, 32) + +#define intra_pred_no_4x4(type) \ + intra_pred_sized(type, 8) \ + intra_pred_sized(type, 16) \ + intra_pred_sized(type, 32) #endif // CONFIG_VP9_HIGHBITDEPTH +#define DST(x, y) dst[(x) + (y) * stride] +#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2) +#define AVG2(a, b) (((a) + (b) + 1) >> 1) + #if CONFIG_VP9_HIGHBITDEPTH static INLINE void highbd_d207_predictor(uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, @@ -97,18 +116,16 @@ static INLINE void highbd_d207_predictor(uint16_t *dst, ptrdiff_t stride, // First column. for (r = 0; r < bs - 1; ++r) { - dst[r * stride] = ROUND_POWER_OF_TWO(left[r] + left[r + 1], 1); + dst[r * stride] = AVG2(left[r], left[r + 1]); } dst[(bs - 1) * stride] = left[bs - 1]; dst++; // Second column. for (r = 0; r < bs - 2; ++r) { - dst[r * stride] = ROUND_POWER_OF_TWO(left[r] + left[r + 1] * 2 + - left[r + 2], 2); + dst[r * stride] = AVG3(left[r], left[r + 1], left[r + 2]); } - dst[(bs - 2) * stride] = ROUND_POWER_OF_TWO(left[bs - 2] + - left[bs - 1] * 3, 2); + dst[(bs - 2) * stride] = AVG3(left[bs - 2], left[bs - 1], left[bs - 1]); dst[(bs - 1) * stride] = left[bs - 1]; dst++; @@ -130,11 +147,9 @@ static INLINE void highbd_d63_predictor(uint16_t *dst, ptrdiff_t stride, (void) bd; for (r = 0; r < bs; ++r) { for (c = 0; c < bs; ++c) { - dst[c] = r & 1 ? ROUND_POWER_OF_TWO(above[r/2 + c] + - above[r/2 + c + 1] * 2 + - above[r/2 + c + 2], 2) - : ROUND_POWER_OF_TWO(above[r/2 + c] + - above[r/2 + c + 1], 1); + dst[c] = r & 1 ? AVG3(above[(r >> 1) + c], above[(r >> 1) + c + 1], + above[(r >> 1) + c + 2]) + : AVG2(above[(r >> 1) + c], above[(r >> 1) + c + 1]); } dst += stride; } @@ -148,9 +163,8 @@ static INLINE void highbd_d45_predictor(uint16_t *dst, ptrdiff_t stride, int bs, (void) bd; for (r = 0; r < bs; ++r) { for (c = 0; c < bs; ++c) { - dst[c] = r + c + 2 < bs * 2 ? ROUND_POWER_OF_TWO(above[r + c] + - above[r + c + 1] * 2 + - above[r + c + 2], 2) + dst[c] = r + c + 2 < bs * 2 ? 
AVG3(above[r + c], above[r + c + 1], + above[r + c + 2]) : above[bs * 2 - 1]; } dst += stride; @@ -165,20 +179,19 @@ static INLINE void highbd_d117_predictor(uint16_t *dst, ptrdiff_t stride, // first row for (c = 0; c < bs; c++) - dst[c] = ROUND_POWER_OF_TWO(above[c - 1] + above[c], 1); + dst[c] = AVG2(above[c - 1], above[c]); dst += stride; // second row - dst[0] = ROUND_POWER_OF_TWO(left[0] + above[-1] * 2 + above[0], 2); + dst[0] = AVG3(left[0], above[-1], above[0]); for (c = 1; c < bs; c++) - dst[c] = ROUND_POWER_OF_TWO(above[c - 2] + above[c - 1] * 2 + above[c], 2); + dst[c] = AVG3(above[c - 2], above[c - 1], above[c]); dst += stride; // the rest of first col - dst[0] = ROUND_POWER_OF_TWO(above[-1] + left[0] * 2 + left[1], 2); + dst[0] = AVG3(above[-1], left[0], left[1]); for (r = 3; r < bs; ++r) - dst[(r - 2) * stride] = ROUND_POWER_OF_TWO(left[r - 3] + left[r - 2] * 2 + - left[r - 1], 2); + dst[(r - 2) * stride] = AVG3(left[r - 3], left[r - 2], left[r - 1]); // the rest of the block for (r = 2; r < bs; ++r) { @@ -193,14 +206,13 @@ static INLINE void highbd_d135_predictor(uint16_t *dst, ptrdiff_t stride, const uint16_t *left, int bd) { int r, c; (void) bd; - dst[0] = ROUND_POWER_OF_TWO(left[0] + above[-1] * 2 + above[0], 2); + dst[0] = AVG3(left[0], above[-1], above[0]); for (c = 1; c < bs; c++) - dst[c] = ROUND_POWER_OF_TWO(above[c - 2] + above[c - 1] * 2 + above[c], 2); + dst[c] = AVG3(above[c - 2], above[c - 1], above[c]); - dst[stride] = ROUND_POWER_OF_TWO(above[-1] + left[0] * 2 + left[1], 2); + dst[stride] = AVG3(above[-1], left[0], left[1]); for (r = 2; r < bs; ++r) - dst[r * stride] = ROUND_POWER_OF_TWO(left[r - 2] + left[r - 1] * 2 + - left[r], 2); + dst[r * stride] = AVG3(left[r - 2], left[r - 1], left[r]); dst += stride; for (r = 1; r < bs; ++r) { @@ -215,20 +227,19 @@ static INLINE void highbd_d153_predictor(uint16_t *dst, ptrdiff_t stride, const uint16_t *left, int bd) { int r, c; (void) bd; - dst[0] = ROUND_POWER_OF_TWO(above[-1] + left[0], 1); + dst[0] = AVG2(above[-1], left[0]); for (r = 1; r < bs; r++) - dst[r * stride] = ROUND_POWER_OF_TWO(left[r - 1] + left[r], 1); + dst[r * stride] = AVG2(left[r - 1], left[r]); dst++; - dst[0] = ROUND_POWER_OF_TWO(left[0] + above[-1] * 2 + above[0], 2); - dst[stride] = ROUND_POWER_OF_TWO(above[-1] + left[0] * 2 + left[1], 2); + dst[0] = AVG3(left[0], above[-1], above[0]); + dst[stride] = AVG3(above[-1], left[0], left[1]); for (r = 2; r < bs; r++) - dst[r * stride] = ROUND_POWER_OF_TWO(left[r - 2] + left[r - 1] * 2 + - left[r], 2); + dst[r * stride] = AVG3(left[r - 2], left[r - 1], left[r]); dst++; for (c = 0; c < bs - 2; c++) - dst[c] = ROUND_POWER_OF_TWO(above[c - 1] + above[c] * 2 + above[c + 1], 2); + dst[c] = AVG3(above[c - 1], above[c], above[c + 1]); dst += stride; for (r = 1; r < bs; ++r) { @@ -344,22 +355,37 @@ static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, } #endif // CONFIG_VP9_HIGHBITDEPTH +void vp9_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const int I = left[0]; + const int J = left[1]; + const int K = left[2]; + const int L = left[3]; + (void)above; + DST(0, 0) = AVG2(I, J); + DST(2, 0) = DST(0, 1) = AVG2(J, K); + DST(2, 1) = DST(0, 2) = AVG2(K, L); + DST(1, 0) = AVG3(I, J, K); + DST(3, 0) = DST(1, 1) = AVG3(J, K, L); + DST(3, 1) = DST(1, 2) = AVG3(K, L, L); + DST(3, 2) = DST(2, 2) = + DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L; +} + static INLINE void d207_predictor(uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, 
const uint8_t *left) { int r, c; (void) above; // first column for (r = 0; r < bs - 1; ++r) - dst[r * stride] = ROUND_POWER_OF_TWO(left[r] + left[r + 1], 1); + dst[r * stride] = AVG2(left[r], left[r + 1]); dst[(bs - 1) * stride] = left[bs - 1]; dst++; // second column for (r = 0; r < bs - 2; ++r) - dst[r * stride] = ROUND_POWER_OF_TWO(left[r] + left[r + 1] * 2 + - left[r + 2], 2); - dst[(bs - 2) * stride] = ROUND_POWER_OF_TWO(left[bs - 2] + - left[bs - 1] * 3, 2); + dst[r * stride] = AVG3(left[r], left[r + 1], left[r + 2]); + dst[(bs - 2) * stride] = AVG3(left[bs - 2], left[bs - 1], left[bs - 1]); dst[(bs - 1) * stride] = left[bs - 1]; dst++; @@ -371,38 +397,112 @@ static INLINE void d207_predictor(uint8_t *dst, ptrdiff_t stride, int bs, for (c = 0; c < bs - 2; ++c) dst[r * stride + c] = dst[(r + 1) * stride + c - 2]; } -intra_pred_allsizes(d207) +intra_pred_no_4x4(d207) + +void vp9_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const int A = above[0]; + const int B = above[1]; + const int C = above[2]; + const int D = above[3]; + const int E = above[4]; + const int F = above[5]; + const int G = above[6]; + (void)left; + DST(0, 0) = AVG2(A, B); + DST(1, 0) = DST(0, 2) = AVG2(B, C); + DST(2, 0) = DST(1, 2) = AVG2(C, D); + DST(3, 0) = DST(2, 2) = AVG2(D, E); + DST(3, 2) = AVG2(E, F); // differs from vp8 + + DST(0, 1) = AVG3(A, B, C); + DST(1, 1) = DST(0, 3) = AVG3(B, C, D); + DST(2, 1) = DST(1, 3) = AVG3(C, D, E); + DST(3, 1) = DST(2, 3) = AVG3(D, E, F); + DST(3, 3) = AVG3(E, F, G); // differs from vp8 +} static INLINE void d63_predictor(uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left) { int r, c; - (void) left; - for (r = 0; r < bs; ++r) { - for (c = 0; c < bs; ++c) - dst[c] = r & 1 ? ROUND_POWER_OF_TWO(above[r/2 + c] + - above[r/2 + c + 1] * 2 + - above[r/2 + c + 2], 2) - : ROUND_POWER_OF_TWO(above[r/2 + c] + - above[r/2 + c + 1], 1); - dst += stride; + int size; + (void)left; + for (c = 0; c < bs; ++c) { + dst[c] = AVG2(above[c], above[c + 1]); + dst[stride + c] = AVG3(above[c], above[c + 1], above[c + 2]); + } + for (r = 2, size = bs - 2; r < bs; r += 2, --size) { + memcpy(dst + (r + 0) * stride, dst + (r >> 1), size); + memset(dst + (r + 0) * stride + size, above[bs - 1], bs - size); + memcpy(dst + (r + 1) * stride, dst + stride + (r >> 1), size); + memset(dst + (r + 1) * stride + size, above[bs - 1], bs - size); } } -intra_pred_allsizes(d63) +intra_pred_no_4x4(d63) + +void vp9_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const int A = above[0]; + const int B = above[1]; + const int C = above[2]; + const int D = above[3]; + const int E = above[4]; + const int F = above[5]; + const int G = above[6]; + const int H = above[7]; + (void)stride; + (void)left; + DST(0, 0) = AVG3(A, B, C); + DST(1, 0) = DST(0, 1) = AVG3(B, C, D); + DST(2, 0) = DST(1, 1) = DST(0, 2) = AVG3(C, D, E); + DST(3, 0) = DST(2, 1) = DST(1, 2) = DST(0, 3) = AVG3(D, E, F); + DST(3, 1) = DST(2, 2) = DST(1, 3) = AVG3(E, F, G); + DST(3, 2) = DST(2, 3) = AVG3(F, G, H); + DST(3, 3) = H; // differs from vp8 +} static INLINE void d45_predictor(uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left) { - int r, c; - (void) left; - for (r = 0; r < bs; ++r) { - for (c = 0; c < bs; ++c) - dst[c] = r + c + 2 < bs * 2 ? 
ROUND_POWER_OF_TWO(above[r + c] + - above[r + c + 1] * 2 + - above[r + c + 2], 2) - : above[bs * 2 - 1]; + const uint8_t above_right = above[bs - 1]; + const uint8_t *const dst_row0 = dst; + int x, size; + (void)left; + + for (x = 0; x < bs - 1; ++x) { + dst[x] = AVG3(above[x], above[x + 1], above[x + 2]); + } + dst[bs - 1] = above_right; + dst += stride; + for (x = 1, size = bs - 2; x < bs; ++x, --size) { + memcpy(dst, dst_row0 + x, size); + memset(dst + size, above_right, x + 1); dst += stride; } } -intra_pred_allsizes(d45) +intra_pred_no_4x4(d45) + +void vp9_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const int I = left[0]; + const int J = left[1]; + const int K = left[2]; + const int X = above[-1]; + const int A = above[0]; + const int B = above[1]; + const int C = above[2]; + const int D = above[3]; + DST(0, 0) = DST(1, 2) = AVG2(X, A); + DST(1, 0) = DST(2, 2) = AVG2(A, B); + DST(2, 0) = DST(3, 2) = AVG2(B, C); + DST(3, 0) = AVG2(C, D); + + DST(0, 3) = AVG3(K, J, I); + DST(0, 2) = AVG3(J, I, X); + DST(0, 1) = DST(1, 3) = AVG3(I, X, A); + DST(1, 1) = DST(2, 3) = AVG3(X, A, B); + DST(2, 1) = DST(3, 3) = AVG3(A, B, C); + DST(3, 1) = AVG3(B, C, D); +} static INLINE void d117_predictor(uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left) { @@ -410,20 +510,19 @@ static INLINE void d117_predictor(uint8_t *dst, ptrdiff_t stride, int bs, // first row for (c = 0; c < bs; c++) - dst[c] = ROUND_POWER_OF_TWO(above[c - 1] + above[c], 1); + dst[c] = AVG2(above[c - 1], above[c]); dst += stride; // second row - dst[0] = ROUND_POWER_OF_TWO(left[0] + above[-1] * 2 + above[0], 2); + dst[0] = AVG3(left[0], above[-1], above[0]); for (c = 1; c < bs; c++) - dst[c] = ROUND_POWER_OF_TWO(above[c - 2] + above[c - 1] * 2 + above[c], 2); + dst[c] = AVG3(above[c - 2], above[c - 1], above[c]); dst += stride; // the rest of first col - dst[0] = ROUND_POWER_OF_TWO(above[-1] + left[0] * 2 + left[1], 2); + dst[0] = AVG3(above[-1], left[0], left[1]); for (r = 3; r < bs; ++r) - dst[(r - 2) * stride] = ROUND_POWER_OF_TWO(left[r - 3] + left[r - 2] * 2 + - left[r - 1], 2); + dst[(r - 2) * stride] = AVG3(left[r - 3], left[r - 2], left[r - 1]); // the rest of the block for (r = 2; r < bs; ++r) { @@ -432,19 +531,39 @@ static INLINE void d117_predictor(uint8_t *dst, ptrdiff_t stride, int bs, dst += stride; } } -intra_pred_allsizes(d117) +intra_pred_no_4x4(d117) + +void vp9_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const int I = left[0]; + const int J = left[1]; + const int K = left[2]; + const int L = left[3]; + const int X = above[-1]; + const int A = above[0]; + const int B = above[1]; + const int C = above[2]; + const int D = above[3]; + (void)stride; + DST(0, 3) = AVG3(J, K, L); + DST(1, 3) = DST(0, 2) = AVG3(I, J, K); + DST(2, 3) = DST(1, 2) = DST(0, 1) = AVG3(X, I, J); + DST(3, 3) = DST(2, 2) = DST(1, 1) = DST(0, 0) = AVG3(A, X, I); + DST(3, 2) = DST(2, 1) = DST(1, 0) = AVG3(B, A, X); + DST(3, 1) = DST(2, 0) = AVG3(C, B, A); + DST(3, 0) = AVG3(D, C, B); +} static INLINE void d135_predictor(uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left) { int r, c; - dst[0] = ROUND_POWER_OF_TWO(left[0] + above[-1] * 2 + above[0], 2); + dst[0] = AVG3(left[0], above[-1], above[0]); for (c = 1; c < bs; c++) - dst[c] = ROUND_POWER_OF_TWO(above[c - 2] + above[c - 1] * 2 + above[c], 2); + dst[c] = AVG3(above[c - 2], above[c - 1], above[c]); - dst[stride] = 
ROUND_POWER_OF_TWO(above[-1] + left[0] * 2 + left[1], 2); + dst[stride] = AVG3(above[-1], left[0], left[1]); for (r = 2; r < bs; ++r) - dst[r * stride] = ROUND_POWER_OF_TWO(left[r - 2] + left[r - 1] * 2 + - left[r], 2); + dst[r * stride] = AVG3(left[r - 2], left[r - 1], left[r]); dst += stride; for (r = 1; r < bs; ++r) { @@ -453,25 +572,48 @@ static INLINE void d135_predictor(uint8_t *dst, ptrdiff_t stride, int bs, dst += stride; } } -intra_pred_allsizes(d135) +intra_pred_no_4x4(d135) + +void vp9_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const int I = left[0]; + const int J = left[1]; + const int K = left[2]; + const int L = left[3]; + const int X = above[-1]; + const int A = above[0]; + const int B = above[1]; + const int C = above[2]; + + DST(0, 0) = DST(2, 1) = AVG2(I, X); + DST(0, 1) = DST(2, 2) = AVG2(J, I); + DST(0, 2) = DST(2, 3) = AVG2(K, J); + DST(0, 3) = AVG2(L, K); + + DST(3, 0) = AVG3(A, B, C); + DST(2, 0) = AVG3(X, A, B); + DST(1, 0) = DST(3, 1) = AVG3(I, X, A); + DST(1, 1) = DST(3, 2) = AVG3(J, I, X); + DST(1, 2) = DST(3, 3) = AVG3(K, J, I); + DST(1, 3) = AVG3(L, K, J); +} static INLINE void d153_predictor(uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left) { int r, c; - dst[0] = ROUND_POWER_OF_TWO(above[-1] + left[0], 1); + dst[0] = AVG2(above[-1], left[0]); for (r = 1; r < bs; r++) - dst[r * stride] = ROUND_POWER_OF_TWO(left[r - 1] + left[r], 1); + dst[r * stride] = AVG2(left[r - 1], left[r]); dst++; - dst[0] = ROUND_POWER_OF_TWO(left[0] + above[-1] * 2 + above[0], 2); - dst[stride] = ROUND_POWER_OF_TWO(above[-1] + left[0] * 2 + left[1], 2); + dst[0] = AVG3(left[0], above[-1], above[0]); + dst[stride] = AVG3(above[-1], left[0], left[1]); for (r = 2; r < bs; r++) - dst[r * stride] = ROUND_POWER_OF_TWO(left[r - 2] + left[r - 1] * 2 + - left[r], 2); + dst[r * stride] = AVG3(left[r - 2], left[r - 1], left[r]); dst++; for (c = 0; c < bs - 2; c++) - dst[c] = ROUND_POWER_OF_TWO(above[c - 1] + above[c] * 2 + above[c + 1], 2); + dst[c] = AVG3(above[c - 1], above[c], above[c + 1]); dst += stride; for (r = 1; r < bs; ++r) { @@ -480,7 +622,7 @@ static INLINE void d153_predictor(uint8_t *dst, ptrdiff_t stride, int bs, dst += stride; } } -intra_pred_allsizes(d153) +intra_pred_no_4x4(d153) static INLINE void v_predictor(uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left) { @@ -658,7 +800,7 @@ static void build_intra_predictors_high(const MACROBLOCKD *xd, uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); DECLARE_ALIGNED(16, uint16_t, left_col[32]); - DECLARE_ALIGNED(16, uint16_t, above_data[128 + 16]); + DECLARE_ALIGNED(16, uint16_t, above_data[64 + 16]); uint16_t *above_row = above_data + 16; const uint16_t *const_above_row = above_row; const int bs = 4 << tx_size; @@ -781,7 +923,7 @@ static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref, int plane) { int i; DECLARE_ALIGNED(16, uint8_t, left_col[32]); - DECLARE_ALIGNED(16, uint8_t, above_data[128 + 16]); + DECLARE_ALIGNED(16, uint8_t, above_data[64 + 16]); uint8_t *above_row = above_data + 16; const uint8_t *const_above_row = above_row; const int bs = 4 << tx_size; @@ -943,6 +1085,6 @@ void vp9_predict_intra_block(const MACROBLOCKD *xd, int block_idx, int bwl_in, have_top, have_left, have_right, x, y, plane); } -void vp9_init_intra_predictors() { +void vp9_init_intra_predictors(void) { once(vp9_init_intra_predictors_internal); } diff --git 
a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconintra.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconintra.h index 845f3bcaac7..da5e435b132 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconintra.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconintra.h @@ -18,7 +18,7 @@ extern "C" { #endif -void vp9_init_intra_predictors(); +void vp9_init_intra_predictors(void); void vp9_predict_intra_block(const MACROBLOCKD *xd, int block_idx, int bwl_in, TX_SIZE tx_size, PREDICTION_MODE mode, diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_rtcd_defs.pl b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_rtcd_defs.pl index d05afa525c3..22a5efdd5b2 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_rtcd_defs.pl +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_rtcd_defs.pl @@ -60,52 +60,52 @@ add_proto qw/void vp9_d207_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, co specialize qw/vp9_d207_predictor_4x4/, "$ssse3_x86inc"; add_proto qw/void vp9_d45_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_d45_predictor_4x4/, "$ssse3_x86inc"; +specialize qw/vp9_d45_predictor_4x4 neon/, "$ssse3_x86inc"; add_proto qw/void vp9_d63_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; specialize qw/vp9_d63_predictor_4x4/, "$ssse3_x86inc"; add_proto qw/void vp9_h_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_h_predictor_4x4 neon dspr2/, "$ssse3_x86inc"; +specialize qw/vp9_h_predictor_4x4 neon dspr2 msa/, "$ssse3_x86inc"; add_proto qw/void vp9_d117_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; specialize qw/vp9_d117_predictor_4x4/; add_proto qw/void vp9_d135_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_d135_predictor_4x4/; +specialize qw/vp9_d135_predictor_4x4 neon/; add_proto qw/void vp9_d153_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; specialize qw/vp9_d153_predictor_4x4/, "$ssse3_x86inc"; add_proto qw/void vp9_v_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_v_predictor_4x4 neon/, "$sse_x86inc"; +specialize qw/vp9_v_predictor_4x4 neon msa/, "$sse_x86inc"; add_proto qw/void vp9_tm_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_tm_predictor_4x4 neon dspr2/, "$sse_x86inc"; +specialize qw/vp9_tm_predictor_4x4 neon dspr2 msa/, "$sse_x86inc"; add_proto qw/void vp9_dc_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_dc_predictor_4x4 dspr2/, "$sse_x86inc"; +specialize qw/vp9_dc_predictor_4x4 dspr2 msa neon/, "$sse_x86inc"; add_proto qw/void vp9_dc_top_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_dc_top_predictor_4x4/, "$sse_x86inc"; +specialize qw/vp9_dc_top_predictor_4x4 msa neon/, "$sse_x86inc"; add_proto qw/void vp9_dc_left_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_dc_left_predictor_4x4/, "$sse_x86inc"; +specialize qw/vp9_dc_left_predictor_4x4 msa neon/, "$sse_x86inc"; add_proto qw/void 
vp9_dc_128_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_dc_128_predictor_4x4/, "$sse_x86inc"; +specialize qw/vp9_dc_128_predictor_4x4 msa neon/, "$sse_x86inc"; add_proto qw/void vp9_d207_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; specialize qw/vp9_d207_predictor_8x8/, "$ssse3_x86inc"; add_proto qw/void vp9_d45_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_d45_predictor_8x8/, "$ssse3_x86inc"; +specialize qw/vp9_d45_predictor_8x8 neon/, "$ssse3_x86inc"; add_proto qw/void vp9_d63_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; specialize qw/vp9_d63_predictor_8x8/, "$ssse3_x86inc"; add_proto qw/void vp9_h_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_h_predictor_8x8 neon dspr2/, "$ssse3_x86inc"; +specialize qw/vp9_h_predictor_8x8 neon dspr2 msa/, "$ssse3_x86inc"; add_proto qw/void vp9_d117_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; specialize qw/vp9_d117_predictor_8x8/; @@ -117,34 +117,34 @@ add_proto qw/void vp9_d153_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, co specialize qw/vp9_d153_predictor_8x8/, "$ssse3_x86inc"; add_proto qw/void vp9_v_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_v_predictor_8x8 neon/, "$sse_x86inc"; +specialize qw/vp9_v_predictor_8x8 neon msa/, "$sse_x86inc"; add_proto qw/void vp9_tm_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_tm_predictor_8x8 neon dspr2/, "$sse2_x86inc"; +specialize qw/vp9_tm_predictor_8x8 neon dspr2 msa/, "$sse2_x86inc"; add_proto qw/void vp9_dc_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_dc_predictor_8x8 dspr2/, "$sse_x86inc"; +specialize qw/vp9_dc_predictor_8x8 dspr2 neon msa/, "$sse_x86inc"; add_proto qw/void vp9_dc_top_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_dc_top_predictor_8x8/, "$sse_x86inc"; +specialize qw/vp9_dc_top_predictor_8x8 neon msa/, "$sse_x86inc"; add_proto qw/void vp9_dc_left_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_dc_left_predictor_8x8/, "$sse_x86inc"; +specialize qw/vp9_dc_left_predictor_8x8 neon msa/, "$sse_x86inc"; add_proto qw/void vp9_dc_128_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_dc_128_predictor_8x8/, "$sse_x86inc"; +specialize qw/vp9_dc_128_predictor_8x8 neon msa/, "$sse_x86inc"; add_proto qw/void vp9_d207_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; specialize qw/vp9_d207_predictor_16x16/, "$ssse3_x86inc"; add_proto qw/void vp9_d45_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_d45_predictor_16x16/, "$ssse3_x86inc"; +specialize qw/vp9_d45_predictor_16x16 neon/, "$ssse3_x86inc"; add_proto qw/void vp9_d63_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; specialize qw/vp9_d63_predictor_16x16/, "$ssse3_x86inc"; add_proto qw/void vp9_h_predictor_16x16/, "uint8_t *dst, ptrdiff_t 
y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_h_predictor_16x16 neon dspr2/, "$ssse3_x86inc"; +specialize qw/vp9_h_predictor_16x16 neon dspr2 msa/, "$ssse3_x86inc"; add_proto qw/void vp9_d117_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; specialize qw/vp9_d117_predictor_16x16/; @@ -156,22 +156,22 @@ add_proto qw/void vp9_d153_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, specialize qw/vp9_d153_predictor_16x16/, "$ssse3_x86inc"; add_proto qw/void vp9_v_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_v_predictor_16x16 neon/, "$sse2_x86inc"; +specialize qw/vp9_v_predictor_16x16 neon msa/, "$sse2_x86inc"; add_proto qw/void vp9_tm_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_tm_predictor_16x16 neon/, "$sse2_x86inc"; +specialize qw/vp9_tm_predictor_16x16 neon msa/, "$sse2_x86inc"; add_proto qw/void vp9_dc_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_dc_predictor_16x16 dspr2/, "$sse2_x86inc"; +specialize qw/vp9_dc_predictor_16x16 dspr2 neon msa/, "$sse2_x86inc"; add_proto qw/void vp9_dc_top_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_dc_top_predictor_16x16/, "$sse2_x86inc"; +specialize qw/vp9_dc_top_predictor_16x16 neon msa/, "$sse2_x86inc"; add_proto qw/void vp9_dc_left_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_dc_left_predictor_16x16/, "$sse2_x86inc"; +specialize qw/vp9_dc_left_predictor_16x16 neon msa/, "$sse2_x86inc"; add_proto qw/void vp9_dc_128_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_dc_128_predictor_16x16/, "$sse2_x86inc"; +specialize qw/vp9_dc_128_predictor_16x16 neon msa/, "$sse2_x86inc"; add_proto qw/void vp9_d207_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; specialize qw/vp9_d207_predictor_32x32/, "$ssse3_x86inc"; @@ -183,7 +183,7 @@ add_proto qw/void vp9_d63_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, c specialize qw/vp9_d63_predictor_32x32/, "$ssse3_x86inc"; add_proto qw/void vp9_h_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_h_predictor_32x32 neon/, "$ssse3_x86inc"; +specialize qw/vp9_h_predictor_32x32 neon msa/, "$ssse3_x86inc"; add_proto qw/void vp9_d117_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; specialize qw/vp9_d117_predictor_32x32/; @@ -192,68 +192,68 @@ add_proto qw/void vp9_d135_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, specialize qw/vp9_d135_predictor_32x32/; add_proto qw/void vp9_d153_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_d153_predictor_32x32/; +specialize qw/vp9_d153_predictor_32x32/, "$ssse3_x86inc"; add_proto qw/void vp9_v_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_v_predictor_32x32 neon/, "$sse2_x86inc"; +specialize qw/vp9_v_predictor_32x32 neon msa/, "$sse2_x86inc"; add_proto qw/void vp9_tm_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize 
qw/vp9_tm_predictor_32x32 neon/, "$sse2_x86_64"; +specialize qw/vp9_tm_predictor_32x32 neon msa/, "$sse2_x86_64"; add_proto qw/void vp9_dc_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_dc_predictor_32x32/, "$sse2_x86inc"; +specialize qw/vp9_dc_predictor_32x32 msa neon/, "$sse2_x86inc"; add_proto qw/void vp9_dc_top_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_dc_top_predictor_32x32/, "$sse2_x86inc"; +specialize qw/vp9_dc_top_predictor_32x32 msa neon/, "$sse2_x86inc"; add_proto qw/void vp9_dc_left_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_dc_left_predictor_32x32/, "$sse2_x86inc"; +specialize qw/vp9_dc_left_predictor_32x32 msa neon/, "$sse2_x86inc"; add_proto qw/void vp9_dc_128_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_dc_128_predictor_32x32/, "$sse2_x86inc"; +specialize qw/vp9_dc_128_predictor_32x32 msa neon/, "$sse2_x86inc"; # # Loopfilter # add_proto qw/void vp9_lpf_vertical_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -specialize qw/vp9_lpf_vertical_16 sse2 neon_asm dspr2/; +specialize qw/vp9_lpf_vertical_16 sse2 neon_asm dspr2 msa/; $vp9_lpf_vertical_16_neon_asm=vp9_lpf_vertical_16_neon; add_proto qw/void vp9_lpf_vertical_16_dual/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -specialize qw/vp9_lpf_vertical_16_dual sse2 neon_asm dspr2/; +specialize qw/vp9_lpf_vertical_16_dual sse2 neon_asm dspr2 msa/; $vp9_lpf_vertical_16_dual_neon_asm=vp9_lpf_vertical_16_dual_neon; add_proto qw/void vp9_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"; -specialize qw/vp9_lpf_vertical_8 sse2 neon_asm dspr2/; +specialize qw/vp9_lpf_vertical_8 sse2 neon_asm dspr2 msa/; $vp9_lpf_vertical_8_neon_asm=vp9_lpf_vertical_8_neon; add_proto qw/void vp9_lpf_vertical_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; -specialize qw/vp9_lpf_vertical_8_dual sse2 neon_asm dspr2/; +specialize qw/vp9_lpf_vertical_8_dual sse2 neon_asm dspr2 msa/; $vp9_lpf_vertical_8_dual_neon_asm=vp9_lpf_vertical_8_dual_neon; add_proto qw/void vp9_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"; -specialize qw/vp9_lpf_vertical_4 mmx neon dspr2/; +specialize qw/vp9_lpf_vertical_4 mmx neon dspr2 msa/; add_proto qw/void vp9_lpf_vertical_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; -specialize qw/vp9_lpf_vertical_4_dual sse2 neon dspr2/; +specialize qw/vp9_lpf_vertical_4_dual sse2 neon dspr2 msa/; add_proto qw/void vp9_lpf_horizontal_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"; -specialize qw/vp9_lpf_horizontal_16 sse2 avx2 neon_asm dspr2/; +specialize qw/vp9_lpf_horizontal_16 sse2 avx2 neon_asm dspr2 msa/; $vp9_lpf_horizontal_16_neon_asm=vp9_lpf_horizontal_16_neon; add_proto qw/void vp9_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int 
count"; -specialize qw/vp9_lpf_horizontal_8 sse2 neon_asm dspr2/; +specialize qw/vp9_lpf_horizontal_8 sse2 neon_asm dspr2 msa/; $vp9_lpf_horizontal_8_neon_asm=vp9_lpf_horizontal_8_neon; add_proto qw/void vp9_lpf_horizontal_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; -specialize qw/vp9_lpf_horizontal_8_dual sse2 neon_asm dspr2/; +specialize qw/vp9_lpf_horizontal_8_dual sse2 neon_asm dspr2 msa/; $vp9_lpf_horizontal_8_dual_neon_asm=vp9_lpf_horizontal_8_dual_neon; add_proto qw/void vp9_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"; -specialize qw/vp9_lpf_horizontal_4 mmx neon dspr2/; +specialize qw/vp9_lpf_horizontal_4 mmx neon dspr2 msa/; add_proto qw/void vp9_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; -specialize qw/vp9_lpf_horizontal_4_dual sse2 neon dspr2/; +specialize qw/vp9_lpf_horizontal_4_dual sse2 neon dspr2 msa/; # # post proc @@ -276,10 +276,10 @@ specialize qw/vp9_plane_add_noise sse2/; $vp9_plane_add_noise_sse2=vp9_plane_add_noise_wmt; add_proto qw/void vp9_filter_by_weight16x16/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight"; -specialize qw/vp9_filter_by_weight16x16 sse2/; +specialize qw/vp9_filter_by_weight16x16 sse2 msa/; add_proto qw/void vp9_filter_by_weight8x8/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight"; -specialize qw/vp9_filter_by_weight8x8 sse2/; +specialize qw/vp9_filter_by_weight8x8 sse2 msa/; } # @@ -301,13 +301,13 @@ add_proto qw/void vp9_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride specialize qw/vp9_convolve8_vert sse2 ssse3 neon dspr2 msa/, "$avx2_ssse3"; add_proto qw/void vp9_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -specialize qw/vp9_convolve8_avg sse2 ssse3 neon dspr2/; +specialize qw/vp9_convolve8_avg sse2 ssse3 neon dspr2 msa/; add_proto qw/void vp9_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -specialize qw/vp9_convolve8_avg_horiz sse2 ssse3 neon dspr2/; +specialize qw/vp9_convolve8_avg_horiz sse2 ssse3 neon dspr2 msa/; add_proto qw/void vp9_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -specialize qw/vp9_convolve8_avg_vert sse2 ssse3 neon dspr2/; +specialize qw/vp9_convolve8_avg_vert sse2 ssse3 neon dspr2 msa/; # # dct @@ -419,19 +419,19 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vp9_iwht4x4_16_add/; } else { add_proto qw/void vp9_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vp9_idct4x4_1_add sse2 neon dspr2/; + specialize qw/vp9_idct4x4_1_add sse2 neon dspr2 msa/; add_proto qw/void vp9_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vp9_idct4x4_16_add sse2 neon dspr2/; + specialize qw/vp9_idct4x4_16_add sse2 neon dspr2 msa/; add_proto qw/void vp9_idct8x8_1_add/, 
"const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vp9_idct8x8_1_add sse2 neon dspr2/; + specialize qw/vp9_idct8x8_1_add sse2 neon dspr2 msa/; add_proto qw/void vp9_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vp9_idct8x8_64_add sse2 neon dspr2/, "$ssse3_x86_64"; + specialize qw/vp9_idct8x8_64_add sse2 neon dspr2 msa/, "$ssse3_x86_64"; add_proto qw/void vp9_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vp9_idct8x8_12_add sse2 neon dspr2/, "$ssse3_x86_64"; + specialize qw/vp9_idct8x8_12_add sse2 neon dspr2 msa/, "$ssse3_x86_64"; add_proto qw/void vp9_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; specialize qw/vp9_idct16x16_1_add sse2 neon dspr2 msa/; @@ -454,10 +454,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vp9_idct32x32_1_add sse2 neon dspr2 msa/; add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; - specialize qw/vp9_iht4x4_16_add sse2 neon dspr2/; + specialize qw/vp9_iht4x4_16_add sse2 neon dspr2 msa/; add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; - specialize qw/vp9_iht8x8_64_add sse2 neon dspr2/; + specialize qw/vp9_iht8x8_64_add sse2 neon dspr2 msa/; add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type"; specialize qw/vp9_iht16x16_256_add sse2 dspr2 msa/; @@ -465,10 +465,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # dct and add add_proto qw/void vp9_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vp9_iwht4x4_1_add/; + specialize qw/vp9_iwht4x4_1_add msa/; add_proto qw/void vp9_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vp9_iwht4x4_16_add/; + specialize qw/vp9_iwht4x4_16_add msa/; } } @@ -797,51 +797,6 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") { # variance -add_proto qw/unsigned int vp9_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance32x16 avx2/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance16x32/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance64x32 avx2 neon/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance32x64 neon/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance32x32 avx2 neon/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance64x64 avx2 neon/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance16x16 avx2 neon/, "$sse2_x86inc"; - -add_proto 
qw/unsigned int vp9_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance16x8/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance8x16/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance8x8 neon/, "$sse2_x86inc"; - -add_proto qw/void vp9_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; -specialize qw/vp9_get8x8var neon/, "$sse2_x86inc"; - -add_proto qw/void vp9_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; -specialize qw/vp9_get16x16var avx2 neon/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance8x4/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance4x8/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance4x4/, "$sse2_x86inc"; - add_proto qw/unsigned int vp9_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vp9_sub_pixel_variance64x64 avx2 neon/, "$sse2_x86inc", "$ssse3_x86inc"; @@ -922,26 +877,11 @@ specialize qw/vp9_sub_pixel_variance4x4/, "$sse_x86inc", "$ssse3_x86inc"; add_proto qw/unsigned int vp9_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; specialize qw/vp9_sub_pixel_avg_variance4x4/, "$sse_x86inc", "$ssse3_x86inc"; -add_proto qw/unsigned int vp9_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; -specialize qw/vp9_mse16x16 avx2/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; -specialize qw/vp9_mse8x16/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; -specialize qw/vp9_mse16x8/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; -specialize qw/vp9_mse8x8/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_get_mb_ss/, "const int16_t *"; -specialize qw/vp9_get_mb_ss/, "$sse2_x86inc"; - add_proto qw/unsigned int vp9_avg_8x8/, "const uint8_t *, int p"; -specialize qw/vp9_avg_8x8 sse2 neon/; +specialize qw/vp9_avg_8x8 sse2 neon msa/; add_proto qw/unsigned int vp9_avg_4x4/, "const uint8_t *, int p"; -specialize qw/vp9_avg_4x4 sse2/; +specialize qw/vp9_avg_4x4 sse2 msa/; add_proto qw/void vp9_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int 
*max"; specialize qw/vp9_minmax_8x8 sse2/; @@ -969,14 +909,14 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vp9_highbd_avg_8x8/; add_proto qw/unsigned int vp9_highbd_avg_4x4/, "const uint8_t *, int p"; specialize qw/vp9_highbd_avg_4x4/; - add_proto qw/unsigned int vp9_highbd_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max"; + add_proto qw/void vp9_highbd_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max"; specialize qw/vp9_highbd_minmax_8x8/; } # ENCODEMB INVOKE add_proto qw/void vp9_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride"; -specialize qw/vp9_subtract_block neon/, "$sse2_x86inc"; +specialize qw/vp9_subtract_block neon msa/, "$sse2_x86inc"; # # Denoiser @@ -1008,7 +948,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vp9_fdct8x8_quant/; } else { add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz"; - specialize qw/vp9_block_error avx2/, "$sse2_x86inc"; + specialize qw/vp9_block_error avx2 msa/, "$sse2_x86inc"; add_proto qw/int64_t vp9_block_error_fp/, "const int16_t *coeff, const int16_t *dqcoeff, int block_size"; specialize qw/vp9_block_error_fp sse2/; @@ -1083,43 +1023,43 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vp9_fdct32x32_rd sse2/; } else { add_proto qw/void vp9_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; - specialize qw/vp9_fht4x4 sse2/; + specialize qw/vp9_fht4x4 sse2 msa/; add_proto qw/void vp9_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; - specialize qw/vp9_fht8x8 sse2/; + specialize qw/vp9_fht8x8 sse2 msa/; add_proto qw/void vp9_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; - specialize qw/vp9_fht16x16 sse2/; + specialize qw/vp9_fht16x16 sse2 msa/; add_proto qw/void vp9_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp9_fwht4x4/, "$mmx_x86inc"; + specialize qw/vp9_fwht4x4 msa/, "$mmx_x86inc"; add_proto qw/void vp9_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vp9_fdct4x4_1 sse2/; add_proto qw/void vp9_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp9_fdct4x4 sse2/; + specialize qw/vp9_fdct4x4 sse2 msa/; add_proto qw/void vp9_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp9_fdct8x8_1 sse2 neon/; + specialize qw/vp9_fdct8x8_1 sse2 neon msa/; add_proto qw/void vp9_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp9_fdct8x8 sse2 neon/, "$ssse3_x86_64"; + specialize qw/vp9_fdct8x8 sse2 neon msa/, "$ssse3_x86_64"; add_proto qw/void vp9_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp9_fdct16x16_1 sse2/; + specialize qw/vp9_fdct16x16_1 sse2 msa/; add_proto qw/void vp9_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp9_fdct16x16 sse2/; + specialize qw/vp9_fdct16x16 sse2 msa/; add_proto qw/void vp9_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp9_fdct32x32_1 sse2/; + specialize qw/vp9_fdct32x32_1 sse2 msa/; add_proto qw/void vp9_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp9_fdct32x32 sse2 avx2/; + 
specialize qw/vp9_fdct32x32 sse2 avx2 msa/; add_proto qw/void vp9_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp9_fdct32x32_rd sse2 avx2/; + specialize qw/vp9_fdct32x32_rd sse2 avx2 msa/; } # @@ -1137,146 +1077,10 @@ add_proto qw/int vp9_full_range_search/, "const struct macroblock *x, const stru specialize qw/vp9_full_range_search/; add_proto qw/void vp9_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count"; -specialize qw/vp9_temporal_filter_apply sse2/; +specialize qw/vp9_temporal_filter_apply sse2 msa/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { - # variance - add_proto qw/unsigned int vp9_highbd_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_variance32x16/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_variance16x32/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_variance64x32/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_variance32x64/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_variance32x32/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_variance64x64/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_variance16x16/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_variance16x8/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_variance8x16/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_variance8x8/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_variance8x4/; - - add_proto qw/unsigned int vp9_highbd_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_variance4x8/; - - add_proto qw/unsigned int vp9_highbd_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize 
qw/vp9_highbd_variance4x4/; - - add_proto qw/void vp9_highbd_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vp9_highbd_get8x8var/, "$sse2_x86inc"; - - add_proto qw/void vp9_highbd_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vp9_highbd_get16x16var/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_10_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_variance32x16/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_10_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_variance16x32/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_10_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_variance64x32/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_10_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_variance32x64/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_10_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_variance32x32/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_10_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_variance64x64/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_10_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_variance16x16/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_10_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_variance16x8/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_10_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_variance8x16/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_10_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_variance8x8/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_10_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_variance8x4/; - - add_proto qw/unsigned int vp9_highbd_10_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_variance4x8/; - - add_proto qw/unsigned int vp9_highbd_10_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_variance4x4/; - - add_proto qw/void vp9_highbd_10_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vp9_highbd_10_get8x8var/, "$sse2_x86inc"; - - 
add_proto qw/void vp9_highbd_10_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vp9_highbd_10_get16x16var/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_12_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_variance32x16/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_12_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_variance16x32/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_12_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_variance64x32/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_12_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_variance32x64/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_12_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_variance32x32/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_variance64x64/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_12_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_variance16x16/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_12_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_variance16x8/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_12_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_variance8x16/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_12_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_variance8x8/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_12_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_variance8x4/; - - add_proto qw/unsigned int vp9_highbd_12_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_variance4x8/; - - add_proto qw/unsigned int vp9_highbd_12_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_variance4x4/; - - add_proto qw/void vp9_highbd_12_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vp9_highbd_12_get8x8var/, "$sse2_x86inc"; - - add_proto qw/void vp9_highbd_12_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vp9_highbd_12_get16x16var/, "$sse2_x86inc"; - add_proto 
qw/unsigned int vp9_highbd_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vp9_highbd_sub_pixel_variance64x64/, "$sse2_x86inc"; @@ -1511,41 +1315,6 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; specialize qw/vp9_highbd_12_sub_pixel_avg_variance4x4/; - add_proto qw/unsigned int vp9_highbd_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/vp9_highbd_mse16x16/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/vp9_highbd_mse8x16/; - - add_proto qw/unsigned int vp9_highbd_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/vp9_highbd_mse16x8/; - - add_proto qw/unsigned int vp9_highbd_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/vp9_highbd_mse8x8/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_10_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_mse16x16/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_10_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_mse8x16/; - - add_proto qw/unsigned int vp9_highbd_10_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_mse16x8/; - - add_proto qw/unsigned int vp9_highbd_10_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_mse8x8/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_12_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_mse16x16/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_12_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_mse8x16/; - - add_proto qw/unsigned int vp9_highbd_12_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_mse16x8/; - - add_proto qw/unsigned int vp9_highbd_12_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_mse8x8/, "$sse2_x86inc"; # ENCODEMB INVOKE diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_scan.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_scan.h index 65e2aa69a54..1d86b5cfe2c 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_scan.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_scan.h @@ -38,6 +38,18 @@ static INLINE int get_coef_context(const int16_t *neighbors, token_cache[neighbors[MAX_NEIGHBORS * c + 1]]) >> 1; } 
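The get_scan() helper added in the hunk below picks the coefficient scan order for a block; as a rough usage sketch (not part of the patch), it is typically paired with the get_coef_context() shown above when walking a block's coefficients. The example_ name and its parameter list are illustrative assumptions; scan_order and its scan/neighbors fields are the ones declared in vp9_scan.h.

/* Illustrative only: look up the scan order for a block and derive the
 * entropy context for coefficient position c from its scanned neighbors. */
static INLINE int example_coef_context_at(const MACROBLOCKD *xd,
                                          TX_SIZE tx_size, PLANE_TYPE type,
                                          int block_idx,
                                          const uint8_t *token_cache, int c) {
  const scan_order *const so = get_scan(xd, tx_size, type, block_idx);
  const int raster_pos = so->scan[c];  /* position of the c-th coefficient */
  const int ctx = get_coef_context(so->neighbors, token_cache, c);
  (void)raster_pos;                    /* a real caller would index dqcoeff */
  return ctx;
}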
+static INLINE const scan_order *get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size,
+                                         PLANE_TYPE type, int block_idx) {
+  const MODE_INFO *const mi = xd->mi[0];
+
+  if (is_inter_block(&mi->mbmi) || type != PLANE_TYPE_Y || xd->lossless) {
+    return &vp9_default_scan_orders[tx_size];
+  } else {
+    const PREDICTION_MODE mode = get_y_mode(mi, block_idx);
+    return &vp9_scan_orders[tx_size][intra_mode_to_tx_type_lookup[mode]];
+  }
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_seg_common.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_seg_common.c
index 910200ecc9c..471e238ccff 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_seg_common.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_seg_common.c
@@ -25,12 +25,6 @@ static const int seg_feature_data_max[SEG_LVL_MAX] = {
 // the coding mechanism is still subject to change so these provide a
 // convenient single point of change.
 
-int vp9_segfeature_active(const struct segmentation *seg, int segment_id,
-                          SEG_LVL_FEATURES feature_id) {
-  return seg->enabled &&
-         (seg->feature_mask[segment_id] & (1 << feature_id));
-}
-
 void vp9_clearall_segfeatures(struct segmentation *seg) {
   vp9_zero(seg->feature_data);
   vp9_zero(seg->feature_mask);
@@ -60,12 +54,6 @@ void vp9_set_segdata(struct segmentation *seg, int segment_id,
   seg->feature_data[segment_id][feature_id] = seg_data;
 }
 
-int vp9_get_segdata(const struct segmentation *seg, int segment_id,
-                    SEG_LVL_FEATURES feature_id) {
-  return seg->feature_data[segment_id][feature_id];
-}
-
-
 const vp9_tree_index vp9_segment_tree[TREE_SIZE(MAX_SEGMENTS)] = {
   2,  4,  6,  8, 10, 12,
   0, -1, -2, -3, -4, -5, -6, -7
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_seg_common.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_seg_common.h
index ff2d66a3658..95c9918303d 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_seg_common.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_seg_common.h
@@ -49,9 +49,12 @@ struct segmentation {
   unsigned int feature_mask[MAX_SEGMENTS];
 };
 
-int vp9_segfeature_active(const struct segmentation *seg,
-                          int segment_id,
-                          SEG_LVL_FEATURES feature_id);
+static INLINE int segfeature_active(const struct segmentation *seg,
+                                    int segment_id,
+                                    SEG_LVL_FEATURES feature_id) {
+  return seg->enabled &&
+         (seg->feature_mask[segment_id] & (1 << feature_id));
+}
 
 void vp9_clearall_segfeatures(struct segmentation *seg);
 
@@ -68,9 +71,10 @@ void vp9_set_segdata(struct segmentation *seg,
                      SEG_LVL_FEATURES feature_id,
                      int seg_data);
 
-int vp9_get_segdata(const struct segmentation *seg,
-                    int segment_id,
-                    SEG_LVL_FEATURES feature_id);
+static INLINE int get_segdata(const struct segmentation *seg, int segment_id,
+                              SEG_LVL_FEATURES feature_id) {
+  return seg->feature_data[segment_id][feature_id];
+}
 
 extern const vp9_tree_index vp9_segment_tree[TREE_SIZE(MAX_SEGMENTS)];
 
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_systemdependent.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_systemdependent.h
index 161c381ad08..fc77762def5 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_systemdependent.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_systemdependent.h
@@ -11,15 +11,14 @@
 #ifndef VP9_COMMON_VP9_SYSTEMDEPENDENT_H_
 #define VP9_COMMON_VP9_SYSTEMDEPENDENT_H_
 
+#include "vpx_ports/msvc.h"
+
 #ifdef _MSC_VER
 # include <math.h>  // the ceil() definition must precede
intrin.h # if _MSC_VER > 1310 && (defined(_M_X64) || defined(_M_IX86)) # include <intrin.h> -# define USE_MSC_INTRIN +# define USE_MSC_INTRINSICS # endif -#if _MSC_VER < 1900 -# define snprintf _snprintf -#endif #endif #ifdef __cplusplus @@ -50,7 +49,7 @@ static INLINE int round(double x) { static INLINE int get_msb(unsigned int n) { return 31 ^ __builtin_clz(n); } -#elif defined(USE_MSC_INTRIN) +#elif defined(USE_MSC_INTRINSICS) #pragma intrinsic(_BitScanReverse) static INLINE int get_msb(unsigned int n) { @@ -58,7 +57,7 @@ static INLINE int get_msb(unsigned int n) { _BitScanReverse(&first_set_bit, n); return first_set_bit; } -#undef USE_MSC_INTRIN +#undef USE_MSC_INTRINSICS #else // Returns (int)floor(log2(n)). n must be > 0. static INLINE int get_msb(unsigned int n) { diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/convolve.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/convolve.h new file mode 100644 index 00000000000..de2df47e5e5 --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/convolve.h @@ -0,0 +1,296 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#ifndef VP9_COMMON_X86_CONVOLVE_H_ +#define VP9_COMMON_X86_CONVOLVE_H_ + +#include <assert.h> + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" + +typedef void filter8_1dfunction ( + const uint8_t *src_ptr, + ptrdiff_t src_pitch, + uint8_t *output_ptr, + ptrdiff_t out_pitch, + uint32_t output_height, + const int16_t *filter +); + +#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \ + void vp9_convolve8_##name##_##opt(const uint8_t *src, ptrdiff_t src_stride, \ + uint8_t *dst, ptrdiff_t dst_stride, \ + const int16_t *filter_x, int x_step_q4, \ + const int16_t *filter_y, int y_step_q4, \ + int w, int h) { \ + if (step_q4 == 16 && filter[3] != 128) { \ + if (filter[0] || filter[1] || filter[2]) { \ + while (w >= 16) { \ + vp9_filter_block1d16_##dir##8_##avg##opt(src_start, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + vp9_filter_block1d8_##dir##8_##avg##opt(src_start, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + vp9_filter_block1d4_##dir##8_##avg##opt(src_start, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + } else { \ + while (w >= 16) { \ + vp9_filter_block1d16_##dir##2_##avg##opt(src, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + vp9_filter_block1d8_##dir##2_##avg##opt(src, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + vp9_filter_block1d4_##dir##2_##avg##opt(src, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + } \ + } \ + if (w) { \ + vp9_convolve8_##name##_c(src, src_stride, dst, dst_stride, \ + filter_x, x_step_q4, filter_y, y_step_q4, \ + w, h); \ + } \ +} + +#define FUN_CONV_2D(avg, 
opt) \ +void vp9_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \ + uint8_t *dst, ptrdiff_t dst_stride, \ + const int16_t *filter_x, int x_step_q4, \ + const int16_t *filter_y, int y_step_q4, \ + int w, int h) { \ + assert(w <= 64); \ + assert(h <= 64); \ + if (x_step_q4 == 16 && y_step_q4 == 16) { \ + if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \ + filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \ + DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71]); \ + vp9_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \ + filter_x, x_step_q4, filter_y, y_step_q4, \ + w, h + 7); \ + vp9_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \ + filter_x, x_step_q4, filter_y, \ + y_step_q4, w, h); \ + } else { \ + DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 65]); \ + vp9_convolve8_horiz_##opt(src, src_stride, fdata2, 64, \ + filter_x, x_step_q4, filter_y, y_step_q4, \ + w, h + 1); \ + vp9_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \ + filter_x, x_step_q4, filter_y, \ + y_step_q4, w, h); \ + } \ + } else { \ + vp9_convolve8_##avg##c(src, src_stride, dst, dst_stride, \ + filter_x, x_step_q4, filter_y, y_step_q4, w, h); \ + } \ +} + +#if CONFIG_VP9_HIGHBITDEPTH + +typedef void highbd_filter8_1dfunction ( + const uint16_t *src_ptr, + const ptrdiff_t src_pitch, + uint16_t *output_ptr, + ptrdiff_t out_pitch, + unsigned int output_height, + const int16_t *filter, + int bd +); + +#define HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \ + void vp9_highbd_convolve8_##name##_##opt(const uint8_t *src8, \ + ptrdiff_t src_stride, \ + uint8_t *dst8, \ + ptrdiff_t dst_stride, \ + const int16_t *filter_x, \ + int x_step_q4, \ + const int16_t *filter_y, \ + int y_step_q4, \ + int w, int h, int bd) { \ + if (step_q4 == 16 && filter[3] != 128) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + if (filter[0] || filter[1] || filter[2]) { \ + while (w >= 16) { \ + vp9_highbd_filter_block1d16_##dir##8_##avg##opt(src_start, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter, \ + bd); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + vp9_highbd_filter_block1d8_##dir##8_##avg##opt(src_start, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter, \ + bd); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + vp9_highbd_filter_block1d4_##dir##8_##avg##opt(src_start, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter, \ + bd); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + } else { \ + while (w >= 16) { \ + vp9_highbd_filter_block1d16_##dir##2_##avg##opt(src, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter, \ + bd); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + vp9_highbd_filter_block1d8_##dir##2_##avg##opt(src, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter, \ + bd); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + vp9_highbd_filter_block1d4_##dir##2_##avg##opt(src, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter, \ + bd); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + } \ + } \ + if (w) { \ + vp9_highbd_convolve8_##name##_c(src8, src_stride, dst8, dst_stride, \ + filter_x, x_step_q4, filter_y, y_step_q4, \ + w, h, bd); \ + } \ +} + +#define HIGH_FUN_CONV_2D(avg, opt) \ +void vp9_highbd_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \ + uint8_t *dst, ptrdiff_t dst_stride, \ + const int16_t *filter_x, 
int x_step_q4, \ + const int16_t *filter_y, int y_step_q4, \ + int w, int h, int bd) { \ + assert(w <= 64); \ + assert(h <= 64); \ + if (x_step_q4 == 16 && y_step_q4 == 16) { \ + if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \ + filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \ + DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \ + vp9_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \ + CONVERT_TO_BYTEPTR(fdata2), 64, \ + filter_x, x_step_q4, \ + filter_y, y_step_q4, \ + w, h + 7, bd); \ + vp9_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2) + 192, \ + 64, dst, dst_stride, \ + filter_x, x_step_q4, \ + filter_y, y_step_q4, \ + w, h, bd); \ + } else { \ + DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]); \ + vp9_highbd_convolve8_horiz_##opt(src, src_stride, \ + CONVERT_TO_BYTEPTR(fdata2), 64, \ + filter_x, x_step_q4, \ + filter_y, y_step_q4, \ + w, h + 1, bd); \ + vp9_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2), 64, \ + dst, dst_stride, \ + filter_x, x_step_q4, \ + filter_y, y_step_q4, \ + w, h, bd); \ + } \ + } else { \ + vp9_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, \ + filter_x, x_step_q4, filter_y, y_step_q4, w, \ + h, bd); \ + } \ +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +#endif // VP9_COMMON_X86_CONVOLVE_H_ diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_asm_stubs.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_asm_stubs.c index 963023c53b1..fd55fb8c664 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_asm_stubs.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_asm_stubs.c @@ -8,421 +8,9 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include <assert.h> - -#include "./vpx_config.h" #include "./vp9_rtcd.h" -#include "vpx_ports/mem.h" - -typedef void filter8_1dfunction ( - const unsigned char *src_ptr, - const ptrdiff_t src_pitch, - unsigned char *output_ptr, - ptrdiff_t out_pitch, - unsigned int output_height, - const short *filter -); - -#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \ - void vp9_convolve8_##name##_##opt(const uint8_t *src, ptrdiff_t src_stride, \ - uint8_t *dst, ptrdiff_t dst_stride, \ - const int16_t *filter_x, int x_step_q4, \ - const int16_t *filter_y, int y_step_q4, \ - int w, int h) { \ - if (step_q4 == 16 && filter[3] != 128) { \ - if (filter[0] || filter[1] || filter[2]) { \ - while (w >= 16) { \ - vp9_filter_block1d16_##dir##8_##avg##opt(src_start, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter); \ - src += 16; \ - dst += 16; \ - w -= 16; \ - } \ - while (w >= 8) { \ - vp9_filter_block1d8_##dir##8_##avg##opt(src_start, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter); \ - src += 8; \ - dst += 8; \ - w -= 8; \ - } \ - while (w >= 4) { \ - vp9_filter_block1d4_##dir##8_##avg##opt(src_start, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter); \ - src += 4; \ - dst += 4; \ - w -= 4; \ - } \ - } else { \ - while (w >= 16) { \ - vp9_filter_block1d16_##dir##2_##avg##opt(src, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter); \ - src += 16; \ - dst += 16; \ - w -= 16; \ - } \ - while (w >= 8) { \ - vp9_filter_block1d8_##dir##2_##avg##opt(src, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter); \ - src += 8; \ - dst += 8; \ - w -= 8; \ - } \ - while (w >= 4) { \ - vp9_filter_block1d4_##dir##2_##avg##opt(src, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter); \ - src += 4; \ - dst 
+= 4; \ - w -= 4; \ - } \ - } \ - } \ - if (w) { \ - vp9_convolve8_##name##_c(src, src_stride, dst, dst_stride, \ - filter_x, x_step_q4, filter_y, y_step_q4, \ - w, h); \ - } \ -} - -#define FUN_CONV_2D(avg, opt) \ -void vp9_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \ - uint8_t *dst, ptrdiff_t dst_stride, \ - const int16_t *filter_x, int x_step_q4, \ - const int16_t *filter_y, int y_step_q4, \ - int w, int h) { \ - assert(w <= 64); \ - assert(h <= 64); \ - if (x_step_q4 == 16 && y_step_q4 == 16) { \ - if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \ - filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \ - DECLARE_ALIGNED(16, unsigned char, fdata2[64 * 71]); \ - vp9_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \ - filter_x, x_step_q4, filter_y, y_step_q4, \ - w, h + 7); \ - vp9_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \ - filter_x, x_step_q4, filter_y, \ - y_step_q4, w, h); \ - } else { \ - DECLARE_ALIGNED(16, unsigned char, fdata2[64 * 65]); \ - vp9_convolve8_horiz_##opt(src, src_stride, fdata2, 64, \ - filter_x, x_step_q4, filter_y, y_step_q4, \ - w, h + 1); \ - vp9_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \ - filter_x, x_step_q4, filter_y, \ - y_step_q4, w, h); \ - } \ - } else { \ - vp9_convolve8_##avg##c(src, src_stride, dst, dst_stride, \ - filter_x, x_step_q4, filter_y, y_step_q4, w, h); \ - } \ -} - -#if CONFIG_VP9_HIGHBITDEPTH - -typedef void highbd_filter8_1dfunction ( - const uint16_t *src_ptr, - const ptrdiff_t src_pitch, - uint16_t *output_ptr, - ptrdiff_t out_pitch, - unsigned int output_height, - const int16_t *filter, - int bd -); - -#define HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \ - void vp9_highbd_convolve8_##name##_##opt(const uint8_t *src8, \ - ptrdiff_t src_stride, \ - uint8_t *dst8, \ - ptrdiff_t dst_stride, \ - const int16_t *filter_x, \ - int x_step_q4, \ - const int16_t *filter_y, \ - int y_step_q4, \ - int w, int h, int bd) { \ - if (step_q4 == 16 && filter[3] != 128) { \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ - if (filter[0] || filter[1] || filter[2]) { \ - while (w >= 16) { \ - vp9_highbd_filter_block1d16_##dir##8_##avg##opt(src_start, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter, \ - bd); \ - src += 16; \ - dst += 16; \ - w -= 16; \ - } \ - while (w >= 8) { \ - vp9_highbd_filter_block1d8_##dir##8_##avg##opt(src_start, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter, \ - bd); \ - src += 8; \ - dst += 8; \ - w -= 8; \ - } \ - while (w >= 4) { \ - vp9_highbd_filter_block1d4_##dir##8_##avg##opt(src_start, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter, \ - bd); \ - src += 4; \ - dst += 4; \ - w -= 4; \ - } \ - } else { \ - while (w >= 16) { \ - vp9_highbd_filter_block1d16_##dir##2_##avg##opt(src, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter, \ - bd); \ - src += 16; \ - dst += 16; \ - w -= 16; \ - } \ - while (w >= 8) { \ - vp9_highbd_filter_block1d8_##dir##2_##avg##opt(src, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter, \ - bd); \ - src += 8; \ - dst += 8; \ - w -= 8; \ - } \ - while (w >= 4) { \ - vp9_highbd_filter_block1d4_##dir##2_##avg##opt(src, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter, \ - bd); \ - src += 4; \ - dst += 4; \ - w -= 4; \ - } \ - } \ - } \ - if (w) { \ - vp9_highbd_convolve8_##name##_c(src8, src_stride, dst8, dst_stride, \ - filter_x, x_step_q4, filter_y, 
y_step_q4, \ - w, h, bd); \ - } \ -} - -#define HIGH_FUN_CONV_2D(avg, opt) \ -void vp9_highbd_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \ - uint8_t *dst, ptrdiff_t dst_stride, \ - const int16_t *filter_x, int x_step_q4, \ - const int16_t *filter_y, int y_step_q4, \ - int w, int h, int bd) { \ - assert(w <= 64); \ - assert(h <= 64); \ - if (x_step_q4 == 16 && y_step_q4 == 16) { \ - if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \ - filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \ - DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \ - vp9_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \ - CONVERT_TO_BYTEPTR(fdata2), 64, \ - filter_x, x_step_q4, \ - filter_y, y_step_q4, \ - w, h + 7, bd); \ - vp9_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2) + 192, \ - 64, dst, dst_stride, \ - filter_x, x_step_q4, \ - filter_y, y_step_q4, \ - w, h, bd); \ - } else { \ - DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]); \ - vp9_highbd_convolve8_horiz_##opt(src, src_stride, \ - CONVERT_TO_BYTEPTR(fdata2), 64, \ - filter_x, x_step_q4, \ - filter_y, y_step_q4, \ - w, h + 1, bd); \ - vp9_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2), 64, \ - dst, dst_stride, \ - filter_x, x_step_q4, \ - filter_y, y_step_q4, \ - w, h, bd); \ - } \ - } else { \ - vp9_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, \ - filter_x, x_step_q4, filter_y, y_step_q4, w, \ - h, bd); \ - } \ -} -#endif // CONFIG_VP9_HIGHBITDEPTH - -#if HAVE_AVX2 && HAVE_SSSE3 -filter8_1dfunction vp9_filter_block1d16_v8_avx2; -filter8_1dfunction vp9_filter_block1d16_h8_avx2; -filter8_1dfunction vp9_filter_block1d4_v8_ssse3; -#if ARCH_X86_64 -filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3; -filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3; -filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3; -#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_intrin_ssse3 -#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_intrin_ssse3 -#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_intrin_ssse3 -#else // ARCH_X86 -filter8_1dfunction vp9_filter_block1d8_v8_ssse3; -filter8_1dfunction vp9_filter_block1d8_h8_ssse3; -filter8_1dfunction vp9_filter_block1d4_h8_ssse3; -#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_ssse3 -#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_ssse3 -#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_ssse3 -#endif // ARCH_X86_64 / ARCH_X86 -filter8_1dfunction vp9_filter_block1d16_v2_ssse3; -filter8_1dfunction vp9_filter_block1d16_h2_ssse3; -filter8_1dfunction vp9_filter_block1d8_v2_ssse3; -filter8_1dfunction vp9_filter_block1d8_h2_ssse3; -filter8_1dfunction vp9_filter_block1d4_v2_ssse3; -filter8_1dfunction vp9_filter_block1d4_h2_ssse3; -#define vp9_filter_block1d4_v8_avx2 vp9_filter_block1d4_v8_ssse3 -#define vp9_filter_block1d16_v2_avx2 vp9_filter_block1d16_v2_ssse3 -#define vp9_filter_block1d16_h2_avx2 vp9_filter_block1d16_h2_ssse3 -#define vp9_filter_block1d8_v2_avx2 vp9_filter_block1d8_v2_ssse3 -#define vp9_filter_block1d8_h2_avx2 vp9_filter_block1d8_h2_ssse3 -#define vp9_filter_block1d4_v2_avx2 vp9_filter_block1d4_v2_ssse3 -#define vp9_filter_block1d4_h2_avx2 vp9_filter_block1d4_h2_ssse3 -// void vp9_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -// void vp9_convolve8_vert_avx2(const uint8_t *src, 
ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2); -FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2); - -// void vp9_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -FUN_CONV_2D(, avx2); -#endif // HAVE_AX2 && HAVE_SSSE3 -#if HAVE_SSSE3 -#if ARCH_X86_64 -filter8_1dfunction vp9_filter_block1d16_v8_intrin_ssse3; -filter8_1dfunction vp9_filter_block1d16_h8_intrin_ssse3; -filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3; -filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3; -filter8_1dfunction vp9_filter_block1d4_v8_ssse3; -filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3; -#define vp9_filter_block1d16_v8_ssse3 vp9_filter_block1d16_v8_intrin_ssse3 -#define vp9_filter_block1d16_h8_ssse3 vp9_filter_block1d16_h8_intrin_ssse3 -#define vp9_filter_block1d8_v8_ssse3 vp9_filter_block1d8_v8_intrin_ssse3 -#define vp9_filter_block1d8_h8_ssse3 vp9_filter_block1d8_h8_intrin_ssse3 -#define vp9_filter_block1d4_h8_ssse3 vp9_filter_block1d4_h8_intrin_ssse3 -#else // ARCH_X86 -filter8_1dfunction vp9_filter_block1d16_v8_ssse3; -filter8_1dfunction vp9_filter_block1d16_h8_ssse3; -filter8_1dfunction vp9_filter_block1d8_v8_ssse3; -filter8_1dfunction vp9_filter_block1d8_h8_ssse3; -filter8_1dfunction vp9_filter_block1d4_v8_ssse3; -filter8_1dfunction vp9_filter_block1d4_h8_ssse3; -#endif // ARCH_X86_64 / ARCH_X86 -filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3; -filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3; -filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3; -filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3; -filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3; -filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3; - -filter8_1dfunction vp9_filter_block1d16_v2_ssse3; -filter8_1dfunction vp9_filter_block1d16_h2_ssse3; -filter8_1dfunction vp9_filter_block1d8_v2_ssse3; -filter8_1dfunction vp9_filter_block1d8_h2_ssse3; -filter8_1dfunction vp9_filter_block1d4_v2_ssse3; -filter8_1dfunction vp9_filter_block1d4_h2_ssse3; -filter8_1dfunction vp9_filter_block1d16_v2_avg_ssse3; -filter8_1dfunction vp9_filter_block1d16_h2_avg_ssse3; -filter8_1dfunction vp9_filter_block1d8_v2_avg_ssse3; -filter8_1dfunction vp9_filter_block1d8_h2_avg_ssse3; -filter8_1dfunction vp9_filter_block1d4_v2_avg_ssse3; -filter8_1dfunction vp9_filter_block1d4_h2_avg_ssse3; - -// void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -// void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -// void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -// void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -FUN_CONV_1D(horiz, 
x_step_q4, filter_x, h, src, , ssse3); -FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3); -FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3); -FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, - ssse3); - -// void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -// void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -FUN_CONV_2D(, ssse3); -FUN_CONV_2D(avg_ , ssse3); -#endif // HAVE_SSSE3 +#include "./vpx_config.h" +#include "vp9/common/x86/convolve.h" #if HAVE_SSE2 filter8_1dfunction vp9_filter_block1d16_v8_sse2; diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_high_loopfilter_intrin_sse2.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_high_loopfilter_intrin_sse2.c index 1637f0e545a..b40669c6375 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_high_loopfilter_intrin_sse2.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_high_loopfilter_intrin_sse2.c @@ -11,6 +11,7 @@ #include <emmintrin.h> // SSE2 #include "./vp9_rtcd.h" +#include "vpx_ports/mem.h" #include "vp9/common/vp9_loopfilter.h" #include "vpx_ports/emmintrin_compat.h" diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c index 0385c7955c9..ce010df3b8a 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c @@ -8,6 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ +#include "./vp9_rtcd.h" +#include "vpx_ports/mem.h" #include "vp9/common/x86/vp9_idct_intrin_sse2.h" #include "vp9/common/vp9_idct.h" @@ -3203,34 +3205,20 @@ void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest, // idct constants for each stage const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); - const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); - const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); - const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64); - const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64); const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); - const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64); - const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64); - const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64); - const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64); const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); - const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); - const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); - const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); - const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); @@ -3240,8 +3228,6 @@ void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest, const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); @@ -3261,47 +3247,29 @@ void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest, stp2_30, stp2_31; __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; int i; - // Load input data. 
- LOAD_DQCOEFF(in[0], input); - LOAD_DQCOEFF(in[8], input); - LOAD_DQCOEFF(in[16], input); - LOAD_DQCOEFF(in[24], input); - LOAD_DQCOEFF(in[1], input); - LOAD_DQCOEFF(in[9], input); - LOAD_DQCOEFF(in[17], input); - LOAD_DQCOEFF(in[25], input); - LOAD_DQCOEFF(in[2], input); - LOAD_DQCOEFF(in[10], input); - LOAD_DQCOEFF(in[18], input); - LOAD_DQCOEFF(in[26], input); - LOAD_DQCOEFF(in[3], input); - LOAD_DQCOEFF(in[11], input); - LOAD_DQCOEFF(in[19], input); - LOAD_DQCOEFF(in[27], input); - - LOAD_DQCOEFF(in[4], input); - LOAD_DQCOEFF(in[12], input); - LOAD_DQCOEFF(in[20], input); - LOAD_DQCOEFF(in[28], input); - LOAD_DQCOEFF(in[5], input); - LOAD_DQCOEFF(in[13], input); - LOAD_DQCOEFF(in[21], input); - LOAD_DQCOEFF(in[29], input); - LOAD_DQCOEFF(in[6], input); - LOAD_DQCOEFF(in[14], input); - LOAD_DQCOEFF(in[22], input); - LOAD_DQCOEFF(in[30], input); - LOAD_DQCOEFF(in[7], input); - LOAD_DQCOEFF(in[15], input); - LOAD_DQCOEFF(in[23], input); - LOAD_DQCOEFF(in[31], input); + + // Load input data. Only need to load the top left 8x8 block. + in[0] = _mm_load_si128((const __m128i *)input); + in[1] = _mm_load_si128((const __m128i *)(input + 32)); + in[2] = _mm_load_si128((const __m128i *)(input + 64)); + in[3] = _mm_load_si128((const __m128i *)(input + 96)); + in[4] = _mm_load_si128((const __m128i *)(input + 128)); + in[5] = _mm_load_si128((const __m128i *)(input + 160)); + in[6] = _mm_load_si128((const __m128i *)(input + 192)); + in[7] = _mm_load_si128((const __m128i *)(input + 224)); + + for (i = 8; i < 32; ++i) { + in[i] = _mm_setzero_si128(); + } array_transpose_8x8(in, in); + // TODO(hkuang): Following transposes are unnecessary. But remove them will + // lead to performance drop on some devices. array_transpose_8x8(in + 8, in + 8); array_transpose_8x8(in + 16, in + 16); array_transpose_8x8(in + 24, in + 24); - IDCT32 + IDCT32_34 // 1_D: Store 32 intermediate results for each 8x32 block. 
col[0] = _mm_add_epi16(stp1_0, stp1_31); diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_loopfilter_intrin_avx2.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_loopfilter_intrin_avx2.c index 0cb0912ad62..770a65f4ca1 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_loopfilter_intrin_avx2.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_loopfilter_intrin_avx2.c @@ -9,6 +9,8 @@ */ #include <immintrin.h> /* AVX2 */ + +#include "./vp9_rtcd.h" #include "vpx_ports/mem.h" static void mb_lpf_horizontal_edge_w_avx2_8(unsigned char *s, int p, diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c index 8723d32836d..e321dbebe39 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c @@ -9,6 +9,8 @@ */ #include <emmintrin.h> // SSE2 + +#include "./vp9_rtcd.h" #include "vp9/common/vp9_loopfilter.h" #include "vpx_ports/emmintrin_compat.h" diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c index 3bc7d3918b7..cee8d1e76ac 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c @@ -8,7 +8,14 @@ * be found in the AUTHORS file in the root of the source tree. */ +// Due to a header conflict between math.h and intrinsics includes with ceil() +// in certain configurations under vs9 this include needs to precede +// immintrin.h. +#include "./vp9_rtcd.h" + #include <immintrin.h> + +#include "vp9/common/x86/convolve.h" #include "vpx_ports/mem.h" // filters for 16_h8 and 16_v8 @@ -53,23 +60,23 @@ DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = { # define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) #endif // __clang__ -void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr, - unsigned int src_pixels_per_line, - unsigned char *output_ptr, - unsigned int output_pitch, - unsigned int output_height, - int16_t *filter) { +static void vp9_filter_block1d16_h8_avx2(const uint8_t *src_ptr, + ptrdiff_t src_pixels_per_line, + uint8_t *output_ptr, + ptrdiff_t output_pitch, + uint32_t output_height, + const int16_t *filter) { __m128i filtersReg; __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg; __m256i firstFilters, secondFilters, thirdFilters, forthFilters; __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3; __m256i srcReg32b1, srcReg32b2, filtersReg32; unsigned int i; - unsigned int src_stride, dst_stride; + ptrdiff_t src_stride, dst_stride; // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((__m128i *)filter); + filtersReg = _mm_loadu_si128((const __m128i *)filter); // converting the 16 bit (short) to 8 bit (byte) and have the same data // in both lanes of 128 bit register. 
filtersReg =_mm_packs_epi16(filtersReg, filtersReg); @@ -104,9 +111,9 @@ void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr, for (i = output_height; i > 1; i-=2) { // load the 2 strides of source srcReg32b1 = _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(src_ptr-3))); + _mm_loadu_si128((const __m128i *)(src_ptr - 3))); srcReg32b1 = _mm256_inserti128_si256(srcReg32b1, - _mm_loadu_si128((__m128i *) + _mm_loadu_si128((const __m128i *) (src_ptr+src_pixels_per_line-3)), 1); // filter the source buffer @@ -135,9 +142,9 @@ void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr, // reading 2 strides of the next 16 bytes // (part of it was being read by earlier read) srcReg32b2 = _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(src_ptr+5))); + _mm_loadu_si128((const __m128i *)(src_ptr + 5))); srcReg32b2 = _mm256_inserti128_si256(srcReg32b2, - _mm_loadu_si128((__m128i *) + _mm_loadu_si128((const __m128i *) (src_ptr+src_pixels_per_line+5)), 1); // add and saturate the results together @@ -202,7 +209,7 @@ void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr, __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1; __m128i srcRegFilt2, srcRegFilt3; - srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3)); + srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); // filter the source buffer srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, @@ -237,7 +244,7 @@ void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr, // reading the next 16 bytes // (part of it was being read by earlier read) - srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5)); + srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5)); // add and saturate the results together srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, @@ -297,12 +304,12 @@ void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr, } } -void vp9_filter_block1d16_v8_avx2(unsigned char *src_ptr, - unsigned int src_pitch, - unsigned char *output_ptr, - unsigned int out_pitch, - unsigned int output_height, - int16_t *filter) { +static void vp9_filter_block1d16_v8_avx2(const uint8_t *src_ptr, + ptrdiff_t src_pitch, + uint8_t *output_ptr, + ptrdiff_t out_pitch, + uint32_t output_height, + const int16_t *filter) { __m128i filtersReg; __m256i addFilterReg64; __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5; @@ -310,11 +317,11 @@ void vp9_filter_block1d16_v8_avx2(unsigned char *src_ptr, __m256i srcReg32b11, srcReg32b12, filtersReg32; __m256i firstFilters, secondFilters, thirdFilters, forthFilters; unsigned int i; - unsigned int src_stride, dst_stride; + ptrdiff_t src_stride, dst_stride; // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((__m128i *)filter); + filtersReg = _mm_loadu_si128((const __m128i *)filter); // converting the 16 bit (short) to 8 bit (byte) and have the // same data in both lanes of 128 bit register. 
filtersReg =_mm_packs_epi16(filtersReg, filtersReg); @@ -344,19 +351,19 @@ void vp9_filter_block1d16_v8_avx2(unsigned char *src_ptr, // load 16 bytes 7 times in stride of src_pitch srcReg32b1 = _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(src_ptr))); + _mm_loadu_si128((const __m128i *)(src_ptr))); srcReg32b2 = _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(src_ptr+src_pitch))); + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch))); srcReg32b3 = _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*2))); + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2))); srcReg32b4 = _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*3))); + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3))); srcReg32b5 = _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*4))); + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4))); srcReg32b6 = _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*5))); + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5))); srcReg32b7 = _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*6))); + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6))); // have each consecutive loads on the same 256 register srcReg32b1 = _mm256_inserti128_si256(srcReg32b1, @@ -393,11 +400,11 @@ void vp9_filter_block1d16_v8_avx2(unsigned char *src_ptr, // load the last 2 loads of 16 bytes and have every two // consecutive loads in the same 256 bit register srcReg32b8 = _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7))); + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7))); srcReg32b7 = _mm256_inserti128_si256(srcReg32b7, _mm256_castsi256_si128(srcReg32b8), 1); srcReg32b9 = _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*8))); + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8))); srcReg32b8 = _mm256_inserti128_si256(srcReg32b8, _mm256_castsi256_si128(srcReg32b9), 1); @@ -476,7 +483,7 @@ void vp9_filter_block1d16_v8_avx2(unsigned char *src_ptr, __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5; __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8; // load the last 16 bytes - srcRegFilt8 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7)); + srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)); // merge the last 2 results together srcRegFilt4 = _mm_unpacklo_epi8( @@ -542,3 +549,54 @@ void vp9_filter_block1d16_v8_avx2(unsigned char *src_ptr, _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); } } + +#if HAVE_AVX2 && HAVE_SSSE3 +filter8_1dfunction vp9_filter_block1d4_v8_ssse3; +#if ARCH_X86_64 +filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3; +filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3; +filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3; +#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_intrin_ssse3 +#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_intrin_ssse3 +#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_intrin_ssse3 +#else // ARCH_X86 +filter8_1dfunction vp9_filter_block1d8_v8_ssse3; +filter8_1dfunction vp9_filter_block1d8_h8_ssse3; +filter8_1dfunction vp9_filter_block1d4_h8_ssse3; +#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_ssse3 +#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_ssse3 +#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_ssse3 +#endif // ARCH_X86_64 +filter8_1dfunction vp9_filter_block1d16_v2_ssse3; +filter8_1dfunction vp9_filter_block1d16_h2_ssse3; 
+filter8_1dfunction vp9_filter_block1d8_v2_ssse3; +filter8_1dfunction vp9_filter_block1d8_h2_ssse3; +filter8_1dfunction vp9_filter_block1d4_v2_ssse3; +filter8_1dfunction vp9_filter_block1d4_h2_ssse3; +#define vp9_filter_block1d4_v8_avx2 vp9_filter_block1d4_v8_ssse3 +#define vp9_filter_block1d16_v2_avx2 vp9_filter_block1d16_v2_ssse3 +#define vp9_filter_block1d16_h2_avx2 vp9_filter_block1d16_h2_ssse3 +#define vp9_filter_block1d8_v2_avx2 vp9_filter_block1d8_v2_ssse3 +#define vp9_filter_block1d8_h2_avx2 vp9_filter_block1d8_h2_ssse3 +#define vp9_filter_block1d4_v2_avx2 vp9_filter_block1d4_v2_ssse3 +#define vp9_filter_block1d4_h2_avx2 vp9_filter_block1d4_h2_ssse3 +// void vp9_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void vp9_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2); +FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2); + +// void vp9_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +FUN_CONV_2D(, avx2); +#endif // HAVE_AX2 && HAVE_SSSE3 diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c index 71dbb402dd4..5fd2857e140 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c @@ -8,7 +8,14 @@ * be found in the AUTHORS file in the root of the source tree. */ +// Due to a header conflict between math.h and intrinsics includes with ceil() +// in certain configurations under vs9 this include needs to precede +// tmmintrin.h. +#include "./vp9_rtcd.h" + #include <tmmintrin.h> + +#include "vp9/common/x86/convolve.h" #include "vpx_ports/mem.h" #include "vpx_ports/emmintrin_compat.h" @@ -38,12 +45,17 @@ DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = { 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 }; -void vp9_filter_block1d4_h8_intrin_ssse3(unsigned char *src_ptr, - unsigned int src_pixels_per_line, - unsigned char *output_ptr, - unsigned int output_pitch, - unsigned int output_height, - int16_t *filter) { +// These are reused by the avx2 intrinsics. 
+filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3; +filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3; +filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3; + +void vp9_filter_block1d4_h8_intrin_ssse3(const uint8_t *src_ptr, + ptrdiff_t src_pixels_per_line, + uint8_t *output_ptr, + ptrdiff_t output_pitch, + uint32_t output_height, + const int16_t *filter) { __m128i firstFilters, secondFilters, shuffle1, shuffle2; __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; __m128i addFilterReg64, filtersReg, srcReg, minReg; @@ -51,7 +63,7 @@ void vp9_filter_block1d4_h8_intrin_ssse3(unsigned char *src_ptr, // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 addFilterReg64 =_mm_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((__m128i *)filter); + filtersReg = _mm_loadu_si128((const __m128i *)filter); // converting the 16 bit (short) to 8 bit (byte) and have the same data // in both lanes of 128 bit register. filtersReg =_mm_packs_epi16(filtersReg, filtersReg); @@ -72,7 +84,7 @@ void vp9_filter_block1d4_h8_intrin_ssse3(unsigned char *src_ptr, shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8); for (i = 0; i < output_height; i++) { - srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3)); + srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); // filter the source buffer srcRegFilt1= _mm_shuffle_epi8(srcReg, shuffle1); @@ -109,12 +121,12 @@ void vp9_filter_block1d4_h8_intrin_ssse3(unsigned char *src_ptr, } } -void vp9_filter_block1d8_h8_intrin_ssse3(unsigned char *src_ptr, - unsigned int src_pixels_per_line, - unsigned char *output_ptr, - unsigned int output_pitch, - unsigned int output_height, - int16_t *filter) { +void vp9_filter_block1d8_h8_intrin_ssse3(const uint8_t *src_ptr, + ptrdiff_t src_pixels_per_line, + uint8_t *output_ptr, + ptrdiff_t output_pitch, + uint32_t output_height, + const int16_t *filter) { __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg; __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; @@ -123,7 +135,7 @@ void vp9_filter_block1d8_h8_intrin_ssse3(unsigned char *src_ptr, // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 addFilterReg64 = _mm_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((__m128i *)filter); + filtersReg = _mm_loadu_si128((const __m128i *)filter); // converting the 16 bit (short) to 8 bit (byte) and have the same data // in both lanes of 128 bit register. 
filtersReg =_mm_packs_epi16(filtersReg, filtersReg); @@ -147,7 +159,7 @@ void vp9_filter_block1d8_h8_intrin_ssse3(unsigned char *src_ptr, filt4Reg = _mm_load_si128((__m128i const *)filt4_global); for (i = 0; i < output_height; i++) { - srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3)); + srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); // filter the source buffer srcRegFilt1= _mm_shuffle_epi8(srcReg, filt1Reg); @@ -189,12 +201,12 @@ void vp9_filter_block1d8_h8_intrin_ssse3(unsigned char *src_ptr, } } -void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr, - unsigned int src_pixels_per_line, - unsigned char *output_ptr, - unsigned int output_pitch, - unsigned int output_height, - int16_t *filter) { +static void vp9_filter_block1d16_h8_intrin_ssse3(const uint8_t *src_ptr, + ptrdiff_t src_pixels_per_line, + uint8_t *output_ptr, + ptrdiff_t output_pitch, + uint32_t output_height, + const int16_t *filter) { __m128i addFilterReg64, filtersReg, srcReg1, srcReg2; __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; __m128i firstFilters, secondFilters, thirdFilters, forthFilters; @@ -203,7 +215,7 @@ void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr, // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 addFilterReg64 = _mm_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((__m128i *)filter); + filtersReg = _mm_loadu_si128((const __m128i *)filter); // converting the 16 bit (short) to 8 bit (byte) and have the same data // in both lanes of 128 bit register. filtersReg =_mm_packs_epi16(filtersReg, filtersReg); @@ -227,7 +239,7 @@ void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr, filt4Reg = _mm_load_si128((__m128i const *)filt4_global); for (i = 0; i < output_height; i++) { - srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3)); + srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); // filter the source buffer srcRegFilt1_1= _mm_shuffle_epi8(srcReg1, filt1Reg); @@ -254,7 +266,7 @@ void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr, // reading the next 16 bytes. // (part of it was being read by earlier read) - srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5)); + srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5)); // add and saturate the results together srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, @@ -306,12 +318,12 @@ void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr, } } -void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr, - unsigned int src_pitch, - unsigned char *output_ptr, - unsigned int out_pitch, - unsigned int output_height, - int16_t *filter) { +void vp9_filter_block1d8_v8_intrin_ssse3(const uint8_t *src_ptr, + ptrdiff_t src_pitch, + uint8_t *output_ptr, + ptrdiff_t out_pitch, + uint32_t output_height, + const int16_t *filter) { __m128i addFilterReg64, filtersReg, minReg; __m128i firstFilters, secondFilters, thirdFilters, forthFilters; __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5; @@ -321,7 +333,7 @@ void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr, // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 addFilterReg64 = _mm_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((__m128i *)filter); + filtersReg = _mm_loadu_si128((const __m128i *)filter); // converting the 16 bit (short) to 8 bit (byte) and have the same data // in both lanes of 128 bit register. 
filtersReg =_mm_packs_epi16(filtersReg, filtersReg); @@ -336,17 +348,17 @@ void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr, forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); // load the first 7 rows of 8 bytes - srcReg1 = _mm_loadl_epi64((__m128i *)&src_ptr[0]); - srcReg2 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch)[0]); - srcReg3 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 2)[0]); - srcReg4 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 3)[0]); - srcReg5 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 4)[0]); - srcReg6 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 5)[0]); - srcReg7 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 6)[0]); + srcReg1 = _mm_loadl_epi64((const __m128i *)src_ptr); + srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch)); + srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); + srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); + srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)); + srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)); + srcReg7 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)); for (i = 0; i < output_height; i++) { // load the last 8 bytes - srcReg8 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 7)[0]); + srcReg8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7)); // merge the result together srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2); @@ -394,12 +406,12 @@ void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr, } } -void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr, - unsigned int src_pitch, - unsigned char *output_ptr, - unsigned int out_pitch, - unsigned int output_height, - int16_t *filter) { +static void vp9_filter_block1d16_v8_intrin_ssse3(const uint8_t *src_ptr, + ptrdiff_t src_pitch, + uint8_t *output_ptr, + ptrdiff_t out_pitch, + uint32_t output_height, + const int16_t *filter) { __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt3; __m128i firstFilters, secondFilters, thirdFilters, forthFilters; __m128i srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8; @@ -409,7 +421,7 @@ void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr, // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 addFilterReg64 = _mm_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((__m128i *)filter); + filtersReg = _mm_loadu_si128((const __m128i *)filter); // converting the 16 bit (short) to 8 bit (byte) and have the same data // in both lanes of 128 bit register. 
filtersReg =_mm_packs_epi16(filtersReg, filtersReg); @@ -424,17 +436,17 @@ void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr, forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); // load the first 7 rows of 16 bytes - srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr)); - srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch)); - srcReg3 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 2)); - srcReg4 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 3)); - srcReg5 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 4)); - srcReg6 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 5)); - srcReg7 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 6)); + srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr)); + srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch)); + srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)); + srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)); + srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)); + srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)); + srcReg7 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)); for (i = 0; i < output_height; i++) { // load the last 16 bytes - srcReg8 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 7)); + srcReg8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)); // merge the result together srcRegFilt5 = _mm_unpacklo_epi8(srcReg1, srcReg2); @@ -508,3 +520,82 @@ void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr, output_ptr+=out_pitch; } } + +#if ARCH_X86_64 +filter8_1dfunction vp9_filter_block1d16_v8_intrin_ssse3; +filter8_1dfunction vp9_filter_block1d16_h8_intrin_ssse3; +filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3; +filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3; +filter8_1dfunction vp9_filter_block1d4_v8_ssse3; +filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3; +#define vp9_filter_block1d16_v8_ssse3 vp9_filter_block1d16_v8_intrin_ssse3 +#define vp9_filter_block1d16_h8_ssse3 vp9_filter_block1d16_h8_intrin_ssse3 +#define vp9_filter_block1d8_v8_ssse3 vp9_filter_block1d8_v8_intrin_ssse3 +#define vp9_filter_block1d8_h8_ssse3 vp9_filter_block1d8_h8_intrin_ssse3 +#define vp9_filter_block1d4_h8_ssse3 vp9_filter_block1d4_h8_intrin_ssse3 +#else // ARCH_X86 +filter8_1dfunction vp9_filter_block1d16_v8_ssse3; +filter8_1dfunction vp9_filter_block1d16_h8_ssse3; +filter8_1dfunction vp9_filter_block1d8_v8_ssse3; +filter8_1dfunction vp9_filter_block1d8_h8_ssse3; +filter8_1dfunction vp9_filter_block1d4_v8_ssse3; +filter8_1dfunction vp9_filter_block1d4_h8_ssse3; +#endif // ARCH_X86_64 +filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3; +filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3; +filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3; +filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3; +filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3; +filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3; + +filter8_1dfunction vp9_filter_block1d16_v2_ssse3; +filter8_1dfunction vp9_filter_block1d16_h2_ssse3; +filter8_1dfunction vp9_filter_block1d8_v2_ssse3; +filter8_1dfunction vp9_filter_block1d8_h2_ssse3; +filter8_1dfunction vp9_filter_block1d4_v2_ssse3; +filter8_1dfunction vp9_filter_block1d4_h2_ssse3; +filter8_1dfunction vp9_filter_block1d16_v2_avg_ssse3; +filter8_1dfunction vp9_filter_block1d16_h2_avg_ssse3; +filter8_1dfunction vp9_filter_block1d8_v2_avg_ssse3; +filter8_1dfunction vp9_filter_block1d8_h2_avg_ssse3; +filter8_1dfunction 
vp9_filter_block1d4_v2_avg_ssse3; +filter8_1dfunction vp9_filter_block1d4_h2_avg_ssse3; + +// void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3); +FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3); +FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3); +FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, + ssse3); + +// void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +FUN_CONV_2D(, ssse3); +FUN_CONV_2D(avg_ , ssse3); diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodeframe.c b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodeframe.c index eb9b7971074..9311d8dad7d 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodeframe.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodeframe.c @@ -15,6 +15,7 @@ #include "./vpx_scale_rtcd.h" #include "vpx_mem/vpx_mem.h" +#include "vpx_ports/mem.h" #include "vpx_ports/mem_ops.h" #include "vpx_scale/vpx_scale.h" @@ -289,9 +290,7 @@ static void inverse_transform_block(MACROBLOCKD* xd, int plane, int block, } struct intra_args { - VP9_COMMON *cm; MACROBLOCKD *xd; - FRAME_COUNTS *counts; vp9_reader *r; int seg_id; }; @@ -300,7 +299,6 @@ static void predict_and_reconstruct_intra_block(int plane, int block, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) { struct intra_args *const args = (struct intra_args *)arg; - VP9_COMMON *const cm = args->cm; MACROBLOCKD *const xd = args->xd; struct macroblockd_plane *const pd = &xd->plane[plane]; MODE_INFO *const mi = xd->mi[0]; @@ -317,7 +315,7 @@ static void predict_and_reconstruct_intra_block(int plane, int block, x, y, plane); if (!mi->mbmi.skip) { - const int eob = vp9_decode_block_tokens(cm, xd, args->counts, plane, block, + const int eob = vp9_decode_block_tokens(xd, plane, block, plane_bsize, x, y, tx_size, args->r, args->seg_id); inverse_transform_block(xd, plane, block, tx_size, dst, pd->dst.stride, @@ -326,10 +324,8 @@ static void predict_and_reconstruct_intra_block(int plane, int block, } struct inter_args { - VP9_COMMON *cm; MACROBLOCKD *xd; vp9_reader *r; - FRAME_COUNTS *counts; int *eobtotal; int seg_id; }; @@ -338,12 +334,11 @@ static void reconstruct_inter_block(int plane, int block, BLOCK_SIZE 
plane_bsize, TX_SIZE tx_size, void *arg) { struct inter_args *args = (struct inter_args *)arg; - VP9_COMMON *const cm = args->cm; MACROBLOCKD *const xd = args->xd; struct macroblockd_plane *const pd = &xd->plane[plane]; int x, y, eob; txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x, &y); - eob = vp9_decode_block_tokens(cm, xd, args->counts, plane, block, plane_bsize, + eob = vp9_decode_block_tokens(xd, plane, block, plane_bsize, x, y, tx_size, args->r, args->seg_id); inverse_transform_block(xd, plane, block, tx_size, &pd->dst.buf[4 * y * pd->dst.stride + 4 * x], @@ -351,6 +346,357 @@ static void reconstruct_inter_block(int plane, int block, *args->eobtotal += eob; } +static void build_mc_border(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + int x, int y, int b_w, int b_h, int w, int h) { + // Get a pointer to the start of the real data for this row. + const uint8_t *ref_row = src - x - y * src_stride; + + if (y >= h) + ref_row += (h - 1) * src_stride; + else if (y > 0) + ref_row += y * src_stride; + + do { + int right = 0, copy; + int left = x < 0 ? -x : 0; + + if (left > b_w) + left = b_w; + + if (x + b_w > w) + right = x + b_w - w; + + if (right > b_w) + right = b_w; + + copy = b_w - left - right; + + if (left) + memset(dst, ref_row[0], left); + + if (copy) + memcpy(dst + left, ref_row + x + left, copy); + + if (right) + memset(dst + left + copy, ref_row[w - 1], right); + + dst += dst_stride; + ++y; + + if (y > 0 && y < h) + ref_row += src_stride; + } while (--b_h); +} + +#if CONFIG_VP9_HIGHBITDEPTH +static void high_build_mc_border(const uint8_t *src8, int src_stride, + uint16_t *dst, int dst_stride, + int x, int y, int b_w, int b_h, + int w, int h) { + // Get a pointer to the start of the real data for this row. + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *ref_row = src - x - y * src_stride; + + if (y >= h) + ref_row += (h - 1) * src_stride; + else if (y > 0) + ref_row += y * src_stride; + + do { + int right = 0, copy; + int left = x < 0 ? 
-x : 0; + + if (left > b_w) + left = b_w; + + if (x + b_w > w) + right = x + b_w - w; + + if (right > b_w) + right = b_w; + + copy = b_w - left - right; + + if (left) + vpx_memset16(dst, ref_row[0], left); + + if (copy) + memcpy(dst + left, ref_row + x + left, copy * sizeof(uint16_t)); + + if (right) + vpx_memset16(dst + left + copy, ref_row[w - 1], right); + + dst += dst_stride; + ++y; + + if (y > 0 && y < h) + ref_row += src_stride; + } while (--b_h); +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +#if CONFIG_VP9_HIGHBITDEPTH +static void extend_and_predict(const uint8_t *buf_ptr1, int pre_buf_stride, + int x0, int y0, int b_w, int b_h, + int frame_width, int frame_height, + int border_offset, + uint8_t *const dst, int dst_buf_stride, + int subpel_x, int subpel_y, + const InterpKernel *kernel, + const struct scale_factors *sf, + MACROBLOCKD *xd, + int w, int h, int ref, int xs, int ys) { + DECLARE_ALIGNED(16, uint16_t, mc_buf_high[80 * 2 * 80 * 2]); + const uint8_t *buf_ptr; + + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + high_build_mc_border(buf_ptr1, pre_buf_stride, mc_buf_high, b_w, + x0, y0, b_w, b_h, frame_width, frame_height); + buf_ptr = CONVERT_TO_BYTEPTR(mc_buf_high) + border_offset; + } else { + build_mc_border(buf_ptr1, pre_buf_stride, (uint8_t *)mc_buf_high, b_w, + x0, y0, b_w, b_h, frame_width, frame_height); + buf_ptr = ((uint8_t *)mc_buf_high) + border_offset; + } + + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + high_inter_predictor(buf_ptr, b_w, dst, dst_buf_stride, subpel_x, + subpel_y, sf, w, h, ref, kernel, xs, ys, xd->bd); + } else { + inter_predictor(buf_ptr, b_w, dst, dst_buf_stride, subpel_x, + subpel_y, sf, w, h, ref, kernel, xs, ys); + } +} +#else +static void extend_and_predict(const uint8_t *buf_ptr1, int pre_buf_stride, + int x0, int y0, int b_w, int b_h, + int frame_width, int frame_height, + int border_offset, + uint8_t *const dst, int dst_buf_stride, + int subpel_x, int subpel_y, + const InterpKernel *kernel, + const struct scale_factors *sf, + int w, int h, int ref, int xs, int ys) { + DECLARE_ALIGNED(16, uint8_t, mc_buf[80 * 2 * 80 * 2]); + const uint8_t *buf_ptr; + + build_mc_border(buf_ptr1, pre_buf_stride, mc_buf, b_w, + x0, y0, b_w, b_h, frame_width, frame_height); + buf_ptr = mc_buf + border_offset; + + inter_predictor(buf_ptr, b_w, dst, dst_buf_stride, subpel_x, + subpel_y, sf, w, h, ref, kernel, xs, ys); +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +static void dec_build_inter_predictors(VP9Decoder *const pbi, MACROBLOCKD *xd, + int plane, int bw, int bh, int x, + int y, int w, int h, int mi_x, int mi_y, + const InterpKernel *kernel, + const struct scale_factors *sf, + struct buf_2d *pre_buf, + struct buf_2d *dst_buf, const MV* mv, + RefCntBuffer *ref_frame_buf, + int is_scaled, int ref) { + struct macroblockd_plane *const pd = &xd->plane[plane]; + uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x; + MV32 scaled_mv; + int xs, ys, x0, y0, x0_16, y0_16, frame_width, frame_height, + buf_stride, subpel_x, subpel_y; + uint8_t *ref_frame, *buf_ptr; + + // Get reference frame pointer, width and height. + if (plane == 0) { + frame_width = ref_frame_buf->buf.y_crop_width; + frame_height = ref_frame_buf->buf.y_crop_height; + ref_frame = ref_frame_buf->buf.y_buffer; + } else { + frame_width = ref_frame_buf->buf.uv_crop_width; + frame_height = ref_frame_buf->buf.uv_crop_height; + ref_frame = plane == 1 ? 
ref_frame_buf->buf.u_buffer + : ref_frame_buf->buf.v_buffer; + } + + if (is_scaled) { + const MV mv_q4 = clamp_mv_to_umv_border_sb(xd, mv, bw, bh, + pd->subsampling_x, + pd->subsampling_y); + // Co-ordinate of containing block to pixel precision. + int x_start = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)); + int y_start = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)); + + // Co-ordinate of the block to 1/16th pixel precision. + x0_16 = (x_start + x) << SUBPEL_BITS; + y0_16 = (y_start + y) << SUBPEL_BITS; + + // Co-ordinate of current block in reference frame + // to 1/16th pixel precision. + x0_16 = sf->scale_value_x(x0_16, sf); + y0_16 = sf->scale_value_y(y0_16, sf); + + // Map the top left corner of the block into the reference frame. + x0 = sf->scale_value_x(x_start + x, sf); + y0 = sf->scale_value_y(y_start + y, sf); + + // Scale the MV and incorporate the sub-pixel offset of the block + // in the reference frame. + scaled_mv = vp9_scale_mv(&mv_q4, mi_x + x, mi_y + y, sf); + xs = sf->x_step_q4; + ys = sf->y_step_q4; + } else { + // Co-ordinate of containing block to pixel precision. + x0 = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)) + x; + y0 = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)) + y; + + // Co-ordinate of the block to 1/16th pixel precision. + x0_16 = x0 << SUBPEL_BITS; + y0_16 = y0 << SUBPEL_BITS; + + scaled_mv.row = mv->row * (1 << (1 - pd->subsampling_y)); + scaled_mv.col = mv->col * (1 << (1 - pd->subsampling_x)); + xs = ys = 16; + } + subpel_x = scaled_mv.col & SUBPEL_MASK; + subpel_y = scaled_mv.row & SUBPEL_MASK; + + // Calculate the top left corner of the best matching block in the + // reference frame. + x0 += scaled_mv.col >> SUBPEL_BITS; + y0 += scaled_mv.row >> SUBPEL_BITS; + x0_16 += scaled_mv.col; + y0_16 += scaled_mv.row; + + // Get reference block pointer. + buf_ptr = ref_frame + y0 * pre_buf->stride + x0; + buf_stride = pre_buf->stride; + + // Do border extension if there is motion or the + // width/height is not a multiple of 8 pixels. + if (is_scaled || scaled_mv.col || scaled_mv.row || + (frame_width & 0x7) || (frame_height & 0x7)) { + int y1 = (y0_16 + (h - 1) * ys) >> SUBPEL_BITS; + + // Get reference block bottom right horizontal coordinate. + int x1 = (x0_16 + (w - 1) * xs) >> SUBPEL_BITS; + int x_pad = 0, y_pad = 0; + + if (subpel_x || (sf->x_step_q4 != SUBPEL_SHIFTS)) { + x0 -= VP9_INTERP_EXTEND - 1; + x1 += VP9_INTERP_EXTEND; + x_pad = 1; + } + + if (subpel_y || (sf->y_step_q4 != SUBPEL_SHIFTS)) { + y0 -= VP9_INTERP_EXTEND - 1; + y1 += VP9_INTERP_EXTEND; + y_pad = 1; + } + + // Wait until reference block is ready. Pad 7 more pixels as last 7 + // pixels of each superblock row can be changed by next superblock row. + if (pbi->frame_parallel_decode) + vp9_frameworker_wait(pbi->frame_worker_owner, ref_frame_buf, + MAX(0, (y1 + 7)) << (plane == 0 ? 0 : 1)); + + // Skip border extension if block is inside the frame. + if (x0 < 0 || x0 > frame_width - 1 || x1 < 0 || x1 > frame_width - 1 || + y0 < 0 || y0 > frame_height - 1 || y1 < 0 || y1 > frame_height - 1) { + // Extend the border. 
+ const uint8_t *const buf_ptr1 = ref_frame + y0 * buf_stride + x0; + const int b_w = x1 - x0 + 1; + const int b_h = y1 - y0 + 1; + const int border_offset = y_pad * 3 * b_w + x_pad * 3; + + extend_and_predict(buf_ptr1, buf_stride, x0, y0, b_w, b_h, + frame_width, frame_height, border_offset, + dst, dst_buf->stride, + subpel_x, subpel_y, + kernel, sf, +#if CONFIG_VP9_HIGHBITDEPTH + xd, +#endif + w, h, ref, xs, ys); + return; + } + } else { + // Wait until reference block is ready. Pad 7 more pixels as last 7 + // pixels of each superblock row can be changed by next superblock row. + if (pbi->frame_parallel_decode) { + const int y1 = (y0_16 + (h - 1) * ys) >> SUBPEL_BITS; + vp9_frameworker_wait(pbi->frame_worker_owner, ref_frame_buf, + MAX(0, (y1 + 7)) << (plane == 0 ? 0 : 1)); + } + } +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + high_inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x, + subpel_y, sf, w, h, ref, kernel, xs, ys, xd->bd); + } else { + inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x, + subpel_y, sf, w, h, ref, kernel, xs, ys); + } +#else + inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x, + subpel_y, sf, w, h, ref, kernel, xs, ys); +#endif // CONFIG_VP9_HIGHBITDEPTH +} + +static void dec_build_inter_predictors_sb(VP9Decoder *const pbi, + MACROBLOCKD *xd, + int mi_row, int mi_col, + BLOCK_SIZE bsize) { + int plane; + const int mi_x = mi_col * MI_SIZE; + const int mi_y = mi_row * MI_SIZE; + const MODE_INFO *mi = xd->mi[0]; + const InterpKernel *kernel = vp9_get_interp_kernel(mi->mbmi.interp_filter); + const BLOCK_SIZE sb_type = mi->mbmi.sb_type; + const int is_compound = has_second_ref(&mi->mbmi); + + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, + &xd->plane[plane]); + struct macroblockd_plane *const pd = &xd->plane[plane]; + struct buf_2d *const dst_buf = &pd->dst; + const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize]; + const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize]; + + const int bw = 4 * num_4x4_w; + const int bh = 4 * num_4x4_h; + int ref; + + for (ref = 0; ref < 1 + is_compound; ++ref) { + const struct scale_factors *const sf = &xd->block_refs[ref]->sf; + struct buf_2d *const pre_buf = &pd->pre[ref]; + const int idx = xd->block_refs[ref]->idx; + BufferPool *const pool = pbi->common.buffer_pool; + RefCntBuffer *const ref_frame_buf = &pool->frame_bufs[idx]; + const int is_scaled = vp9_is_scaled(sf); + + if (sb_type < BLOCK_8X8) { + int i = 0, x, y; + assert(bsize == BLOCK_8X8); + for (y = 0; y < num_4x4_h; ++y) { + for (x = 0; x < num_4x4_w; ++x) { + const MV mv = average_split_mvs(pd, mi, ref, i++); + dec_build_inter_predictors(pbi, xd, plane, bw, bh, + 4 * x, 4 * y, 4, 4, mi_x, mi_y, kernel, + sf, pre_buf, dst_buf, &mv, + ref_frame_buf, is_scaled, ref); + } + } + } else { + const MV mv = mi->mbmi.mv[ref].as_mv; + dec_build_inter_predictors(pbi, xd, plane, bw, bh, + 0, 0, bw, bh, mi_x, mi_y, kernel, + sf, pre_buf, dst_buf, &mv, ref_frame_buf, + is_scaled, ref); + } + } + } +} + static MB_MODE_INFO *set_offsets(VP9_COMMON *const cm, MACROBLOCKD *const xd, const TileInfo *const tile, BLOCK_SIZE bsize, int mi_row, int mi_col) { @@ -380,14 +726,22 @@ static MB_MODE_INFO *set_offsets(VP9_COMMON *const cm, MACROBLOCKD *const xd, } static void decode_block(VP9Decoder *const pbi, MACROBLOCKD *const xd, - FRAME_COUNTS *counts, const TileInfo *const tile, int mi_row, int mi_col, vp9_reader *r, 
BLOCK_SIZE bsize) { VP9_COMMON *const cm = &pbi->common; const int less8x8 = bsize < BLOCK_8X8; MB_MODE_INFO *mbmi = set_offsets(cm, xd, tile, bsize, mi_row, mi_col); - vp9_read_mode_info(pbi, xd, counts, tile, mi_row, mi_col, r); + + if (bsize >= BLOCK_8X8 && (cm->subsampling_x || cm->subsampling_y)) { + const BLOCK_SIZE uv_subsize = + ss_size_lookup[bsize][cm->subsampling_x][cm->subsampling_y]; + if (uv_subsize == BLOCK_INVALID) + vpx_internal_error(xd->error_info, + VPX_CODEC_CORRUPT_FRAME, "Invalid block size."); + } + + vp9_read_mode_info(pbi, xd, tile, mi_row, mi_col, r); if (less8x8) bsize = BLOCK_8X8; @@ -397,17 +751,17 @@ static void decode_block(VP9Decoder *const pbi, MACROBLOCKD *const xd, } if (!is_inter_block(mbmi)) { - struct intra_args arg = {cm, xd, counts, r, mbmi->segment_id}; + struct intra_args arg = {xd, r, mbmi->segment_id}; vp9_foreach_transformed_block(xd, bsize, predict_and_reconstruct_intra_block, &arg); } else { // Prediction - vp9_dec_build_inter_predictors_sb(pbi, xd, mi_row, mi_col, bsize); + dec_build_inter_predictors_sb(pbi, xd, mi_row, mi_col, bsize); // Reconstruction if (!mbmi->skip) { int eobtotal = 0; - struct inter_args arg = {cm, xd, r, counts, &eobtotal, mbmi->segment_id}; + struct inter_args arg = {xd, r, &eobtotal, mbmi->segment_id}; vp9_foreach_transformed_block(xd, bsize, reconstruct_inter_block, &arg); if (!less8x8 && eobtotal == 0) mbmi->skip = 1; // skip loopfilter @@ -417,14 +771,12 @@ static void decode_block(VP9Decoder *const pbi, MACROBLOCKD *const xd, xd->corrupted |= vp9_reader_has_error(r); } -static PARTITION_TYPE read_partition(VP9_COMMON *cm, MACROBLOCKD *xd, - FRAME_COUNTS *counts, int hbs, - int mi_row, int mi_col, BLOCK_SIZE bsize, - vp9_reader *r) { +static PARTITION_TYPE read_partition(MACROBLOCKD *xd, int mi_row, int mi_col, + BLOCK_SIZE bsize, vp9_reader *r, + int has_rows, int has_cols) { const int ctx = partition_plane_context(xd, mi_row, mi_col, bsize); - const vp9_prob *const probs = get_partition_probs(cm, ctx); - const int has_rows = (mi_row + hbs) < cm->mi_rows; - const int has_cols = (mi_col + hbs) < cm->mi_cols; + const vp9_prob *const probs = get_partition_probs(xd, ctx); + FRAME_COUNTS *counts = xd->counts; PARTITION_TYPE p; if (has_rows && has_cols) @@ -436,56 +788,50 @@ static PARTITION_TYPE read_partition(VP9_COMMON *cm, MACROBLOCKD *xd, else p = PARTITION_SPLIT; - if (!cm->frame_parallel_decoding_mode) + if (counts) ++counts->partition[ctx][p]; return p; } static void decode_partition(VP9Decoder *const pbi, MACROBLOCKD *const xd, - FRAME_COUNTS *counts, const TileInfo *const tile, int mi_row, int mi_col, vp9_reader* r, BLOCK_SIZE bsize) { VP9_COMMON *const cm = &pbi->common; const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2; PARTITION_TYPE partition; - BLOCK_SIZE subsize, uv_subsize; + BLOCK_SIZE subsize; + const int has_rows = (mi_row + hbs) < cm->mi_rows; + const int has_cols = (mi_col + hbs) < cm->mi_cols; if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; - partition = read_partition(cm, xd, counts, hbs, mi_row, mi_col, bsize, r); + partition = read_partition(xd, mi_row, mi_col, bsize, r, has_rows, has_cols); subsize = get_subsize(bsize, partition); - uv_subsize = ss_size_lookup[subsize][cm->subsampling_x][cm->subsampling_y]; - if (subsize >= BLOCK_8X8 && uv_subsize == BLOCK_INVALID) - vpx_internal_error(xd->error_info, - VPX_CODEC_CORRUPT_FRAME, "Invalid block size."); - if (subsize < BLOCK_8X8) { - decode_block(pbi, xd, counts, tile, mi_row, mi_col, r, subsize); + if (bsize == BLOCK_8X8) { + 
decode_block(pbi, xd, tile, mi_row, mi_col, r, subsize); } else { switch (partition) { case PARTITION_NONE: - decode_block(pbi, xd, counts, tile, mi_row, mi_col, r, subsize); + decode_block(pbi, xd, tile, mi_row, mi_col, r, subsize); break; case PARTITION_HORZ: - decode_block(pbi, xd, counts, tile, mi_row, mi_col, r, subsize); - if (mi_row + hbs < cm->mi_rows) - decode_block(pbi, xd, counts, tile, mi_row + hbs, mi_col, r, subsize); + decode_block(pbi, xd, tile, mi_row, mi_col, r, subsize); + if (has_rows) + decode_block(pbi, xd, tile, mi_row + hbs, mi_col, r, subsize); break; case PARTITION_VERT: - decode_block(pbi, xd, counts, tile, mi_row, mi_col, r, subsize); - if (mi_col + hbs < cm->mi_cols) - decode_block(pbi, xd, counts, tile, mi_row, mi_col + hbs, r, subsize); + decode_block(pbi, xd, tile, mi_row, mi_col, r, subsize); + if (has_cols) + decode_block(pbi, xd, tile, mi_row, mi_col + hbs, r, subsize); break; case PARTITION_SPLIT: - decode_partition(pbi, xd, counts, tile, mi_row, mi_col, r, subsize); - decode_partition(pbi, xd, counts, tile, mi_row, mi_col + hbs, r, - subsize); - decode_partition(pbi, xd, counts, tile, mi_row + hbs, mi_col, r, - subsize); - decode_partition(pbi, xd, counts, tile, mi_row + hbs, mi_col + hbs, r, - subsize); + decode_partition(pbi, xd, tile, mi_row, mi_col, r, subsize); + decode_partition(pbi, xd, tile, mi_row, mi_col + hbs, r, subsize); + decode_partition(pbi, xd, tile, mi_row + hbs, mi_col, r, subsize); + decode_partition(pbi, xd, tile, mi_row + hbs, mi_col + hbs, r, subsize); break; default: assert(0 && "Invalid partition type"); @@ -673,12 +1019,6 @@ static INTERP_FILTER read_interp_filter(struct vp9_read_bit_buffer *rb) { : literal_to_filter[vp9_rb_read_literal(rb, 2)]; } -void vp9_read_frame_size(struct vp9_read_bit_buffer *rb, - int *width, int *height) { - *width = vp9_rb_read_literal(rb, 16) + 1; - *height = vp9_rb_read_literal(rb, 16) + 1; -} - static void setup_display_size(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) { cm->display_width = cm->width; cm->display_height = cm->height; @@ -698,7 +1038,8 @@ static void resize_context_buffers(VP9_COMMON *cm, int width, int height) { #if CONFIG_SIZE_LIMIT if (width > DECODE_WIDTH_LIMIT || height > DECODE_HEIGHT_LIMIT) vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, - "Width and height beyond allowed size."); + "Dimensions of %dx%d beyond allowed size of %dx%d.", + width, height, DECODE_WIDTH_LIMIT, DECODE_HEIGHT_LIMIT); #endif if (cm->width != width || cm->height != height) { const int new_mi_rows = @@ -929,7 +1270,8 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, int mi_row, mi_col; TileData *tile_data = NULL; - if (cm->lf.filter_level && pbi->lf_worker.data1 == NULL) { + if (cm->lf.filter_level && !cm->skip_loop_filter && + pbi->lf_worker.data1 == NULL) { CHECK_MEM_ERROR(cm, pbi->lf_worker.data1, vpx_memalign(32, sizeof(LFWorkerData))); pbi->lf_worker.hook = (VP9WorkerHook)vp9_loop_filter_worker; @@ -939,7 +1281,7 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, } } - if (cm->lf.filter_level) { + if (cm->lf.filter_level && !cm->skip_loop_filter) { LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1; // Be sure to sync as we might be resuming after a failed frame decode. winterface->sync(&pbi->lf_worker); @@ -979,6 +1321,8 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, tile_data->cm = cm; tile_data->xd = pbi->mb; tile_data->xd.corrupted = 0; + tile_data->xd.counts = cm->frame_parallel_decoding_mode ? 
+ NULL : &cm->counts; vp9_tile_init(&tile, tile_data->cm, tile_row, tile_col); setup_token_decoder(buf->data, data_end, buf->size, &cm->error, &tile_data->bit_reader, pbi->decrypt_cb, @@ -1001,7 +1345,7 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, vp9_zero(tile_data->xd.left_seg_context); for (mi_col = tile.mi_col_start; mi_col < tile.mi_col_end; mi_col += MI_BLOCK_SIZE) { - decode_partition(pbi, &tile_data->xd, &cm->counts, &tile, mi_row, + decode_partition(pbi, &tile_data->xd, &tile, mi_row, mi_col, &tile_data->bit_reader, BLOCK_64X64); } pbi->mb.corrupted |= tile_data->xd.corrupted; @@ -1010,7 +1354,7 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, "Failed to decode tile data"); } // Loopfilter one row. - if (cm->lf.filter_level) { + if (cm->lf.filter_level && !cm->skip_loop_filter) { const int lf_start = mi_row - MI_BLOCK_SIZE; LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1; @@ -1039,7 +1383,7 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, } // Loopfilter remaining rows in the frame. - if (cm->lf.filter_level) { + if (cm->lf.filter_level && !cm->skip_loop_filter) { LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1; winterface->sync(&pbi->lf_worker); lf_data->start = lf_data->stop; @@ -1074,7 +1418,7 @@ static int tile_worker_hook(TileWorkerData *const tile_data, vp9_zero(tile_data->xd.left_seg_context); for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end; mi_col += MI_BLOCK_SIZE) { - decode_partition(tile_data->pbi, &tile_data->xd, &tile_data->counts, + decode_partition(tile_data->pbi, &tile_data->xd, tile, mi_row, mi_col, &tile_data->bit_reader, BLOCK_64X64); } @@ -1112,8 +1456,6 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi, if (pbi->num_tile_workers == 0) { const int num_threads = pbi->max_threads & ~1; int i; - // TODO(jzern): Allocate one less worker, as in the current code we only - // use num_threads - 1 workers. CHECK_MEM_ERROR(cm, pbi->tile_workers, vpx_malloc(num_threads * sizeof(*pbi->tile_workers))); // Ensure tile data offsets will be properly aligned. This may fail on @@ -1198,6 +1540,8 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi, tile_data->pbi = pbi; tile_data->xd = pbi->mb; tile_data->xd.corrupted = 0; + tile_data->xd.counts = cm->frame_parallel_decoding_mode ? 
+ 0 : &tile_data->counts; vp9_tile_init(tile, cm, 0, buf->col); setup_token_decoder(buf->data, data_end, buf->size, &cm->error, &tile_data->bit_reader, pbi->decrypt_cb, @@ -1251,20 +1595,6 @@ static void error_handler(void *data) { vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, "Truncated packet"); } -int vp9_read_sync_code(struct vp9_read_bit_buffer *const rb) { - return vp9_rb_read_literal(rb, 8) == VP9_SYNC_CODE_0 && - vp9_rb_read_literal(rb, 8) == VP9_SYNC_CODE_1 && - vp9_rb_read_literal(rb, 8) == VP9_SYNC_CODE_2; -} - -BITSTREAM_PROFILE vp9_read_profile(struct vp9_read_bit_buffer *rb) { - int profile = vp9_rb_read_bit(rb); - profile |= vp9_rb_read_bit(rb) << 1; - if (profile > 2) - profile += vp9_rb_read_bit(rb); - return (BITSTREAM_PROFILE) profile; -} - static void read_bitdepth_colorspace_sampling( VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) { if (cm->profile >= PROFILE_2) { @@ -1311,12 +1641,13 @@ static void read_bitdepth_colorspace_sampling( static size_t read_uncompressed_header(VP9Decoder *pbi, struct vp9_read_bit_buffer *rb) { VP9_COMMON *const cm = &pbi->common; - RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; - BufferPool *const pool = pbi->common.buffer_pool; + BufferPool *const pool = cm->buffer_pool; + RefCntBuffer *const frame_bufs = pool->frame_bufs; int i, mask, ref_index = 0; size_t sz; cm->last_frame_type = cm->frame_type; + cm->last_intra_only = cm->intra_only; if (vp9_rb_read_literal(rb, 2) != VP9_FRAME_MARKER) vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, @@ -1593,12 +1924,12 @@ static void debug_check_frame_counts(const VP9_COMMON *const cm) { } #endif // NDEBUG -static struct vp9_read_bit_buffer* init_read_bit_buffer( +static struct vp9_read_bit_buffer *init_read_bit_buffer( VP9Decoder *pbi, struct vp9_read_bit_buffer *rb, const uint8_t *data, const uint8_t *data_end, - uint8_t *clear_data /* buffer size MAX_VP9_HEADER_SIZE */) { + uint8_t clear_data[MAX_VP9_HEADER_SIZE]) { rb->bit_offset = 0; rb->error_handler = error_handler; rb->error_handler_data = &pbi->common; @@ -1614,12 +1945,34 @@ static struct vp9_read_bit_buffer* init_read_bit_buffer( return rb; } +//------------------------------------------------------------------------------ + +int vp9_read_sync_code(struct vp9_read_bit_buffer *const rb) { + return vp9_rb_read_literal(rb, 8) == VP9_SYNC_CODE_0 && + vp9_rb_read_literal(rb, 8) == VP9_SYNC_CODE_1 && + vp9_rb_read_literal(rb, 8) == VP9_SYNC_CODE_2; +} + +void vp9_read_frame_size(struct vp9_read_bit_buffer *rb, + int *width, int *height) { + *width = vp9_rb_read_literal(rb, 16) + 1; + *height = vp9_rb_read_literal(rb, 16) + 1; +} + +BITSTREAM_PROFILE vp9_read_profile(struct vp9_read_bit_buffer *rb) { + int profile = vp9_rb_read_bit(rb); + profile |= vp9_rb_read_bit(rb) << 1; + if (profile > 2) + profile += vp9_rb_read_bit(rb); + return (BITSTREAM_PROFILE) profile; +} + void vp9_decode_frame(VP9Decoder *pbi, const uint8_t *data, const uint8_t *data_end, const uint8_t **p_data_end) { VP9_COMMON *const cm = &pbi->common; MACROBLOCKD *const xd = &pbi->mb; - struct vp9_read_bit_buffer rb = { NULL, NULL, 0, NULL, 0}; + struct vp9_read_bit_buffer rb; int context_updated = 0; uint8_t clear_data[MAX_VP9_HEADER_SIZE]; const size_t first_partition_size = read_uncompressed_header(pbi, @@ -1643,8 +1996,9 @@ void vp9_decode_frame(VP9Decoder *pbi, cm->use_prev_frame_mvs = !cm->error_resilient_mode && cm->width == cm->last_width && cm->height == cm->last_height && - !cm->intra_only && - cm->last_show_frame; + 
!cm->last_intra_only && + cm->last_show_frame && + (cm->last_frame_type != KEY_FRAME); vp9_setup_block_planes(xd, cm->subsampling_x, cm->subsampling_y); @@ -1661,7 +2015,7 @@ void vp9_decode_frame(VP9Decoder *pbi, vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, "Decode failed. Frame data header is corrupted."); - if (cm->lf.filter_level) { + if (cm->lf.filter_level && !cm->skip_loop_filter) { vp9_loop_filter_frame_init(cm, cm->lf.filter_level); } @@ -1687,11 +2041,13 @@ void vp9_decode_frame(VP9Decoder *pbi, // Multi-threaded tile decoder *p_data_end = decode_tiles_mt(pbi, data + first_partition_size, data_end); if (!xd->corrupted) { - // If multiple threads are used to decode tiles, then we use those threads - // to do parallel loopfiltering. - vp9_loop_filter_frame_mt(new_fb, cm, pbi->mb.plane, cm->lf.filter_level, - 0, 0, pbi->tile_workers, pbi->num_tile_workers, - &pbi->lf_row_sync); + if (!cm->skip_loop_filter) { + // If multiple threads are used to decode tiles, then we use those + // threads to do parallel loopfiltering. + vp9_loop_filter_frame_mt(new_fb, cm, pbi->mb.plane, + cm->lf.filter_level, 0, 0, pbi->tile_workers, + pbi->num_tile_workers, &pbi->lf_row_sync); + } } else { vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, "Decode failed. Frame data is corrupted."); @@ -1721,327 +2077,3 @@ void vp9_decode_frame(VP9Decoder *pbi, if (cm->refresh_frame_context && !context_updated) cm->frame_contexts[cm->frame_context_idx] = *cm->fc; } - -static void build_mc_border(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, - int x, int y, int b_w, int b_h, int w, int h) { - // Get a pointer to the start of the real data for this row. - const uint8_t *ref_row = src - x - y * src_stride; - - if (y >= h) - ref_row += (h - 1) * src_stride; - else if (y > 0) - ref_row += y * src_stride; - - do { - int right = 0, copy; - int left = x < 0 ? -x : 0; - - if (left > b_w) - left = b_w; - - if (x + b_w > w) - right = x + b_w - w; - - if (right > b_w) - right = b_w; - - copy = b_w - left - right; - - if (left) - memset(dst, ref_row[0], left); - - if (copy) - memcpy(dst + left, ref_row + x + left, copy); - - if (right) - memset(dst + left + copy, ref_row[w - 1], right); - - dst += dst_stride; - ++y; - - if (y > 0 && y < h) - ref_row += src_stride; - } while (--b_h); -} - -#if CONFIG_VP9_HIGHBITDEPTH -static void high_build_mc_border(const uint8_t *src8, int src_stride, - uint16_t *dst, int dst_stride, - int x, int y, int b_w, int b_h, - int w, int h) { - // Get a pointer to the start of the real data for this row. - const uint16_t *src = CONVERT_TO_SHORTPTR(src8); - const uint16_t *ref_row = src - x - y * src_stride; - - if (y >= h) - ref_row += (h - 1) * src_stride; - else if (y > 0) - ref_row += y * src_stride; - - do { - int right = 0, copy; - int left = x < 0 ? 
-x : 0; - - if (left > b_w) - left = b_w; - - if (x + b_w > w) - right = x + b_w - w; - - if (right > b_w) - right = b_w; - - copy = b_w - left - right; - - if (left) - vpx_memset16(dst, ref_row[0], left); - - if (copy) - memcpy(dst + left, ref_row + x + left, copy * sizeof(uint16_t)); - - if (right) - vpx_memset16(dst + left + copy, ref_row[w - 1], right); - - dst += dst_stride; - ++y; - - if (y > 0 && y < h) - ref_row += src_stride; - } while (--b_h); -} -#endif // CONFIG_VP9_HIGHBITDEPTH - -void dec_build_inter_predictors(VP9Decoder *const pbi, MACROBLOCKD *xd, - int plane, int bw, int bh, int x, - int y, int w, int h, int mi_x, int mi_y, - const InterpKernel *kernel, - const struct scale_factors *sf, - struct buf_2d *pre_buf, struct buf_2d *dst_buf, - const MV* mv, RefCntBuffer *ref_frame_buf, - int is_scaled, int ref) { - struct macroblockd_plane *const pd = &xd->plane[plane]; - uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x; - MV32 scaled_mv; - int xs, ys, x0, y0, x0_16, y0_16, frame_width, frame_height, - buf_stride, subpel_x, subpel_y; - uint8_t *ref_frame, *buf_ptr; - - // Get reference frame pointer, width and height. - if (plane == 0) { - frame_width = ref_frame_buf->buf.y_crop_width; - frame_height = ref_frame_buf->buf.y_crop_height; - ref_frame = ref_frame_buf->buf.y_buffer; - } else { - frame_width = ref_frame_buf->buf.uv_crop_width; - frame_height = ref_frame_buf->buf.uv_crop_height; - ref_frame = plane == 1 ? ref_frame_buf->buf.u_buffer - : ref_frame_buf->buf.v_buffer; - } - - if (is_scaled) { - const MV mv_q4 = clamp_mv_to_umv_border_sb(xd, mv, bw, bh, - pd->subsampling_x, - pd->subsampling_y); - // Co-ordinate of containing block to pixel precision. - int x_start = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)); - int y_start = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)); - - // Co-ordinate of the block to 1/16th pixel precision. - x0_16 = (x_start + x) << SUBPEL_BITS; - y0_16 = (y_start + y) << SUBPEL_BITS; - - // Co-ordinate of current block in reference frame - // to 1/16th pixel precision. - x0_16 = sf->scale_value_x(x0_16, sf); - y0_16 = sf->scale_value_y(y0_16, sf); - - // Map the top left corner of the block into the reference frame. - x0 = sf->scale_value_x(x_start + x, sf); - y0 = sf->scale_value_y(y_start + y, sf); - - // Scale the MV and incorporate the sub-pixel offset of the block - // in the reference frame. - scaled_mv = vp9_scale_mv(&mv_q4, mi_x + x, mi_y + y, sf); - xs = sf->x_step_q4; - ys = sf->y_step_q4; - } else { - // Co-ordinate of containing block to pixel precision. - x0 = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)) + x; - y0 = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)) + y; - - // Co-ordinate of the block to 1/16th pixel precision. - x0_16 = x0 << SUBPEL_BITS; - y0_16 = y0 << SUBPEL_BITS; - - scaled_mv.row = mv->row * (1 << (1 - pd->subsampling_y)); - scaled_mv.col = mv->col * (1 << (1 - pd->subsampling_x)); - xs = ys = 16; - } - subpel_x = scaled_mv.col & SUBPEL_MASK; - subpel_y = scaled_mv.row & SUBPEL_MASK; - - // Calculate the top left corner of the best matching block in the - // reference frame. - x0 += scaled_mv.col >> SUBPEL_BITS; - y0 += scaled_mv.row >> SUBPEL_BITS; - x0_16 += scaled_mv.col; - y0_16 += scaled_mv.row; - - // Get reference block pointer. - buf_ptr = ref_frame + y0 * pre_buf->stride + x0; - buf_stride = pre_buf->stride; - - // Do border extension if there is motion or the - // width/height is not a multiple of 8 pixels. 
- if (is_scaled || scaled_mv.col || scaled_mv.row || - (frame_width & 0x7) || (frame_height & 0x7)) { - int y1 = (y0_16 + (h - 1) * ys) >> SUBPEL_BITS; - - // Get reference block bottom right horizontal coordinate. - int x1 = (x0_16 + (w - 1) * xs) >> SUBPEL_BITS; - int x_pad = 0, y_pad = 0; - - if (subpel_x || (sf->x_step_q4 != SUBPEL_SHIFTS)) { - x0 -= VP9_INTERP_EXTEND - 1; - x1 += VP9_INTERP_EXTEND; - x_pad = 1; - } - - if (subpel_y || (sf->y_step_q4 != SUBPEL_SHIFTS)) { - y0 -= VP9_INTERP_EXTEND - 1; - y1 += VP9_INTERP_EXTEND; - y_pad = 1; - } - - // Wait until reference block is ready. Pad 7 more pixels as last 7 - // pixels of each superblock row can be changed by next superblock row. - if (pbi->frame_parallel_decode) - vp9_frameworker_wait(pbi->frame_worker_owner, ref_frame_buf, - MAX(0, (y1 + 7)) << (plane == 0 ? 0 : 1)); - - // Skip border extension if block is inside the frame. - if (x0 < 0 || x0 > frame_width - 1 || x1 < 0 || x1 > frame_width - 1 || - y0 < 0 || y0 > frame_height - 1 || y1 < 0 || y1 > frame_height - 1) { - uint8_t *buf_ptr1 = ref_frame + y0 * pre_buf->stride + x0; - // Extend the border. -#if CONFIG_VP9_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - high_build_mc_border(buf_ptr1, - pre_buf->stride, - xd->mc_buf_high, - x1 - x0 + 1, - x0, - y0, - x1 - x0 + 1, - y1 - y0 + 1, - frame_width, - frame_height); - buf_stride = x1 - x0 + 1; - buf_ptr = CONVERT_TO_BYTEPTR(xd->mc_buf_high) + - y_pad * 3 * buf_stride + x_pad * 3; - } else { - build_mc_border(buf_ptr1, - pre_buf->stride, - xd->mc_buf, - x1 - x0 + 1, - x0, - y0, - x1 - x0 + 1, - y1 - y0 + 1, - frame_width, - frame_height); - buf_stride = x1 - x0 + 1; - buf_ptr = xd->mc_buf + y_pad * 3 * buf_stride + x_pad * 3; - } -#else - build_mc_border(buf_ptr1, - pre_buf->stride, - xd->mc_buf, - x1 - x0 + 1, - x0, - y0, - x1 - x0 + 1, - y1 - y0 + 1, - frame_width, - frame_height); - buf_stride = x1 - x0 + 1; - buf_ptr = xd->mc_buf + y_pad * 3 * buf_stride + x_pad * 3; -#endif // CONFIG_VP9_HIGHBITDEPTH - } - } else { - // Wait until reference block is ready. Pad 7 more pixels as last 7 - // pixels of each superblock row can be changed by next superblock row. - if (pbi->frame_parallel_decode) { - const int y1 = ((y0_16 + (h - 1) * ys) >> SUBPEL_BITS) + 1; - vp9_frameworker_wait(pbi->frame_worker_owner, ref_frame_buf, - MAX(0, (y1 + 7)) << (plane == 0 ? 
0 : 1)); - } - } -#if CONFIG_VP9_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - high_inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x, - subpel_y, sf, w, h, ref, kernel, xs, ys, xd->bd); - } else { - inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x, - subpel_y, sf, w, h, ref, kernel, xs, ys); - } -#else - inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x, - subpel_y, sf, w, h, ref, kernel, xs, ys); -#endif // CONFIG_VP9_HIGHBITDEPTH -} - -void vp9_dec_build_inter_predictors_sb(VP9Decoder *const pbi, MACROBLOCKD *xd, - int mi_row, int mi_col, - BLOCK_SIZE bsize) { - int plane; - const int mi_x = mi_col * MI_SIZE; - const int mi_y = mi_row * MI_SIZE; - const MODE_INFO *mi = xd->mi[0]; - const InterpKernel *kernel = vp9_get_interp_kernel(mi->mbmi.interp_filter); - const BLOCK_SIZE sb_type = mi->mbmi.sb_type; - const int is_compound = has_second_ref(&mi->mbmi); - - for (plane = 0; plane < MAX_MB_PLANE; ++plane) { - const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, - &xd->plane[plane]); - struct macroblockd_plane *const pd = &xd->plane[plane]; - struct buf_2d *const dst_buf = &pd->dst; - const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize]; - const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize]; - - const int bw = 4 * num_4x4_w; - const int bh = 4 * num_4x4_h; - int ref; - - for (ref = 0; ref < 1 + is_compound; ++ref) { - const struct scale_factors *const sf = &xd->block_refs[ref]->sf; - struct buf_2d *const pre_buf = &pd->pre[ref]; - const int idx = xd->block_refs[ref]->idx; - BufferPool *const pool = pbi->common.buffer_pool; - RefCntBuffer *const ref_frame_buf = &pool->frame_bufs[idx]; - const int is_scaled = vp9_is_scaled(sf); - - if (sb_type < BLOCK_8X8) { - int i = 0, x, y; - assert(bsize == BLOCK_8X8); - for (y = 0; y < num_4x4_h; ++y) { - for (x = 0; x < num_4x4_w; ++x) { - const MV mv = average_split_mvs(pd, mi, ref, i++); - dec_build_inter_predictors(pbi, xd, plane, bw, bh, - 4 * x, 4 * y, 4, 4, mi_x, mi_y, kernel, - sf, pre_buf, dst_buf, &mv, - ref_frame_buf, is_scaled, ref); - } - } - } else { - const MV mv = mi->mbmi.mv[ref].as_mv; - dec_build_inter_predictors(pbi, xd, plane, bw, bh, - 0, 0, bw, bh, mi_x, mi_y, kernel, - sf, pre_buf, dst_buf, &mv, ref_frame_buf, - is_scaled, ref); - } - } - } -} diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodeframe.h b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodeframe.h index 8410c541e45..a876e7c60f1 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodeframe.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodeframe.h @@ -16,24 +16,18 @@ extern "C" { #endif -struct VP9Common; struct VP9Decoder; struct vp9_read_bit_buffer; -void vp9_init_dequantizer(struct VP9Common *cm); - -void vp9_decode_frame(struct VP9Decoder *pbi, - const uint8_t *data, const uint8_t *data_end, - const uint8_t **p_data_end); - int vp9_read_sync_code(struct vp9_read_bit_buffer *const rb); void vp9_read_frame_size(struct vp9_read_bit_buffer *rb, int *width, int *height); BITSTREAM_PROFILE vp9_read_profile(struct vp9_read_bit_buffer *rb); -void vp9_dec_build_inter_predictors_sb(struct VP9Decoder *const pbi, - MACROBLOCKD *xd, int mi_row, int mi_col, - BLOCK_SIZE bsize); +void vp9_decode_frame(struct VP9Decoder *pbi, + const uint8_t *data, const uint8_t *data_end, + const uint8_t **p_data_end); + #ifdef __cplusplus } // extern "C" #endif diff --git 
a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodemv.c b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodemv.c index ce6ff997778..8a8d8ddd8e6 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodemv.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodemv.c @@ -27,30 +27,33 @@ static PREDICTION_MODE read_intra_mode(vp9_reader *r, const vp9_prob *p) { return (PREDICTION_MODE)vp9_read_tree(r, vp9_intra_mode_tree, p); } -static PREDICTION_MODE read_intra_mode_y(VP9_COMMON *cm, FRAME_COUNTS *counts, +static PREDICTION_MODE read_intra_mode_y(VP9_COMMON *cm, MACROBLOCKD *xd, vp9_reader *r, int size_group) { const PREDICTION_MODE y_mode = read_intra_mode(r, cm->fc->y_mode_prob[size_group]); - if (!cm->frame_parallel_decoding_mode) + FRAME_COUNTS *counts = xd->counts; + if (counts) ++counts->y_mode[size_group][y_mode]; return y_mode; } -static PREDICTION_MODE read_intra_mode_uv(VP9_COMMON *cm, FRAME_COUNTS *counts, +static PREDICTION_MODE read_intra_mode_uv(VP9_COMMON *cm, MACROBLOCKD *xd, vp9_reader *r, PREDICTION_MODE y_mode) { const PREDICTION_MODE uv_mode = read_intra_mode(r, cm->fc->uv_mode_prob[y_mode]); - if (!cm->frame_parallel_decoding_mode) + FRAME_COUNTS *counts = xd->counts; + if (counts) ++counts->uv_mode[y_mode][uv_mode]; return uv_mode; } -static PREDICTION_MODE read_inter_mode(VP9_COMMON *cm, FRAME_COUNTS *counts, +static PREDICTION_MODE read_inter_mode(VP9_COMMON *cm, MACROBLOCKD *xd, vp9_reader *r, int ctx) { const int mode = vp9_read_tree(r, vp9_inter_mode_tree, cm->fc->inter_mode_probs[ctx]); - if (!cm->frame_parallel_decoding_mode) + FRAME_COUNTS *counts = xd->counts; + if (counts) ++counts->inter_mode[ctx][mode]; return NEARESTMV + mode; @@ -61,8 +64,8 @@ static int read_segment_id(vp9_reader *r, const struct segmentation *seg) { } static TX_SIZE read_selected_tx_size(VP9_COMMON *cm, MACROBLOCKD *xd, - FRAME_COUNTS *counts, TX_SIZE max_tx_size, vp9_reader *r) { + FRAME_COUNTS *counts = xd->counts; const int ctx = vp9_get_tx_size_context(xd); const vp9_prob *tx_probs = get_tx_probs(max_tx_size, ctx, &cm->fc->tx_probs); int tx_size = vp9_read(r, tx_probs[0]); @@ -72,19 +75,18 @@ static TX_SIZE read_selected_tx_size(VP9_COMMON *cm, MACROBLOCKD *xd, tx_size += vp9_read(r, tx_probs[2]); } - if (!cm->frame_parallel_decoding_mode) + if (counts) ++get_tx_counts(max_tx_size, ctx, &counts->tx)[tx_size]; return (TX_SIZE)tx_size; } static TX_SIZE read_tx_size(VP9_COMMON *cm, MACROBLOCKD *xd, - FRAME_COUNTS *counts, int allow_select, vp9_reader *r) { TX_MODE tx_mode = cm->tx_mode; BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type; const TX_SIZE max_tx_size = max_txsize_lookup[bsize]; if (allow_select && tx_mode == TX_MODE_SELECT && bsize >= BLOCK_8X8) - return read_selected_tx_size(cm, xd, counts, max_tx_size, r); + return read_selected_tx_size(cm, xd, max_tx_size, r); else return MIN(max_tx_size, tx_mode_to_biggest_tx_size[tx_mode]); } @@ -174,14 +176,14 @@ static int read_inter_segment_id(VP9_COMMON *const cm, MACROBLOCKD *const xd, } static int read_skip(VP9_COMMON *cm, const MACROBLOCKD *xd, - FRAME_COUNTS *counts, int segment_id, vp9_reader *r) { - if (vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) { + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) { return 1; } else { const int ctx = vp9_get_skip_context(xd); const int skip = vp9_read(r, cm->fc->skip_probs[ctx]); - if (!cm->frame_parallel_decoding_mode) + FRAME_COUNTS *counts = xd->counts; + if (counts) ++counts->skip[ctx][skip]; return skip; } 
@@ -189,7 +191,6 @@ static int read_skip(VP9_COMMON *cm, const MACROBLOCKD *xd, static void read_intra_frame_mode_info(VP9_COMMON *const cm, MACROBLOCKD *const xd, - FRAME_COUNTS *counts, int mi_row, int mi_col, vp9_reader *r) { MODE_INFO *const mi = xd->mi[0]; MB_MODE_INFO *const mbmi = &mi->mbmi; @@ -199,8 +200,8 @@ static void read_intra_frame_mode_info(VP9_COMMON *const cm, int i; mbmi->segment_id = read_intra_segment_id(cm, bsize, mi_row, mi_col, r); - mbmi->skip = read_skip(cm, xd, counts, mbmi->segment_id, r); - mbmi->tx_size = read_tx_size(cm, xd, counts, 1, r); + mbmi->skip = read_skip(cm, xd, mbmi->segment_id, r); + mbmi->tx_size = read_tx_size(cm, xd, 1, r); mbmi->ref_frame[0] = INTRA_FRAME; mbmi->ref_frame[1] = NONE; @@ -285,13 +286,13 @@ static INLINE void read_mv(vp9_reader *r, MV *mv, const MV *ref, static REFERENCE_MODE read_block_reference_mode(VP9_COMMON *cm, const MACROBLOCKD *xd, - FRAME_COUNTS *counts, vp9_reader *r) { if (cm->reference_mode == REFERENCE_MODE_SELECT) { const int ctx = vp9_get_reference_mode_context(cm, xd); const REFERENCE_MODE mode = (REFERENCE_MODE)vp9_read(r, cm->fc->comp_inter_prob[ctx]); - if (!cm->frame_parallel_decoding_mode) + FRAME_COUNTS *counts = xd->counts; + if (counts) ++counts->comp_inter[ctx][mode]; return mode; // SINGLE_REFERENCE or COMPOUND_REFERENCE } else { @@ -301,34 +302,35 @@ static REFERENCE_MODE read_block_reference_mode(VP9_COMMON *cm, // Read the referncence frame static void read_ref_frames(VP9_COMMON *const cm, MACROBLOCKD *const xd, - FRAME_COUNTS *counts, vp9_reader *r, + vp9_reader *r, int segment_id, MV_REFERENCE_FRAME ref_frame[2]) { FRAME_CONTEXT *const fc = cm->fc; + FRAME_COUNTS *counts = xd->counts; - if (vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) { - ref_frame[0] = (MV_REFERENCE_FRAME)vp9_get_segdata(&cm->seg, segment_id, - SEG_LVL_REF_FRAME); + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) { + ref_frame[0] = (MV_REFERENCE_FRAME)get_segdata(&cm->seg, segment_id, + SEG_LVL_REF_FRAME); ref_frame[1] = NONE; } else { - const REFERENCE_MODE mode = read_block_reference_mode(cm, xd, counts, r); + const REFERENCE_MODE mode = read_block_reference_mode(cm, xd, r); // FIXME(rbultje) I'm pretty sure this breaks segmentation ref frame coding if (mode == COMPOUND_REFERENCE) { const int idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref]; const int ctx = vp9_get_pred_context_comp_ref_p(cm, xd); const int bit = vp9_read(r, fc->comp_ref_prob[ctx]); - if (!cm->frame_parallel_decoding_mode) + if (counts) ++counts->comp_ref[ctx][bit]; ref_frame[idx] = cm->comp_fixed_ref; ref_frame[!idx] = cm->comp_var_ref[bit]; } else if (mode == SINGLE_REFERENCE) { const int ctx0 = vp9_get_pred_context_single_ref_p1(xd); const int bit0 = vp9_read(r, fc->single_ref_prob[ctx0][0]); - if (!cm->frame_parallel_decoding_mode) + if (counts) ++counts->single_ref[ctx0][0][bit0]; if (bit0) { const int ctx1 = vp9_get_pred_context_single_ref_p2(xd); const int bit1 = vp9_read(r, fc->single_ref_prob[ctx1][1]); - if (!cm->frame_parallel_decoding_mode) + if (counts) ++counts->single_ref[ctx1][1][bit1]; ref_frame[0] = bit1 ? 
ALTREF_FRAME : GOLDEN_FRAME; } else { @@ -345,18 +347,19 @@ static void read_ref_frames(VP9_COMMON *const cm, MACROBLOCKD *const xd, static INLINE INTERP_FILTER read_switchable_interp_filter( VP9_COMMON *const cm, MACROBLOCKD *const xd, - FRAME_COUNTS *counts, vp9_reader *r) { + vp9_reader *r) { const int ctx = vp9_get_pred_context_switchable_interp(xd); const INTERP_FILTER type = (INTERP_FILTER)vp9_read_tree(r, vp9_switchable_interp_tree, cm->fc->switchable_interp_prob[ctx]); - if (!cm->frame_parallel_decoding_mode) + FRAME_COUNTS *counts = xd->counts; + if (counts) ++counts->switchable_interp[ctx][type]; return type; } static void read_intra_block_mode_info(VP9_COMMON *const cm, - FRAME_COUNTS *counts, MODE_INFO *mi, + MACROBLOCKD *const xd, MODE_INFO *mi, vp9_reader *r) { MB_MODE_INFO *const mbmi = &mi->mbmi; const BLOCK_SIZE bsize = mi->mbmi.sb_type; @@ -368,26 +371,26 @@ static void read_intra_block_mode_info(VP9_COMMON *const cm, switch (bsize) { case BLOCK_4X4: for (i = 0; i < 4; ++i) - mi->bmi[i].as_mode = read_intra_mode_y(cm, counts, r, 0); + mi->bmi[i].as_mode = read_intra_mode_y(cm, xd, r, 0); mbmi->mode = mi->bmi[3].as_mode; break; case BLOCK_4X8: - mi->bmi[0].as_mode = mi->bmi[2].as_mode = read_intra_mode_y(cm, counts, + mi->bmi[0].as_mode = mi->bmi[2].as_mode = read_intra_mode_y(cm, xd, r, 0); mi->bmi[1].as_mode = mi->bmi[3].as_mode = mbmi->mode = - read_intra_mode_y(cm, counts, r, 0); + read_intra_mode_y(cm, xd, r, 0); break; case BLOCK_8X4: - mi->bmi[0].as_mode = mi->bmi[1].as_mode = read_intra_mode_y(cm, counts, + mi->bmi[0].as_mode = mi->bmi[1].as_mode = read_intra_mode_y(cm, xd, r, 0); mi->bmi[2].as_mode = mi->bmi[3].as_mode = mbmi->mode = - read_intra_mode_y(cm, counts, r, 0); + read_intra_mode_y(cm, xd, r, 0); break; default: - mbmi->mode = read_intra_mode_y(cm, counts, r, size_group_lookup[bsize]); + mbmi->mode = read_intra_mode_y(cm, xd, r, size_group_lookup[bsize]); } - mbmi->uv_mode = read_intra_mode_uv(cm, counts, r, mbmi->mode); + mbmi->uv_mode = read_intra_mode_uv(cm, xd, r, mbmi->mode); } static INLINE int is_mv_valid(const MV *mv) { @@ -395,7 +398,7 @@ static INLINE int is_mv_valid(const MV *mv) { mv->col > MV_LOW && mv->col < MV_UPP; } -static INLINE int assign_mv(VP9_COMMON *cm, FRAME_COUNTS *counts, +static INLINE int assign_mv(VP9_COMMON *cm, MACROBLOCKD *xd, PREDICTION_MODE mode, int_mv mv[2], int_mv ref_mv[2], int_mv nearest_mv[2], int_mv near_mv[2], @@ -405,8 +408,8 @@ static INLINE int assign_mv(VP9_COMMON *cm, FRAME_COUNTS *counts, switch (mode) { case NEWMV: { - nmv_context_counts *const mv_counts = cm->frame_parallel_decoding_mode ? - NULL : &counts->mv; + FRAME_COUNTS *counts = xd->counts; + nmv_context_counts *const mv_counts = counts ? 
&counts->mv : NULL; for (i = 0; i < 1 + is_compound; ++i) { read_mv(r, &mv[i].as_mv, &ref_mv[i].as_mv, &cm->fc->nmvc, mv_counts, allow_hp); @@ -440,15 +443,14 @@ static INLINE int assign_mv(VP9_COMMON *cm, FRAME_COUNTS *counts, } static int read_is_inter_block(VP9_COMMON *const cm, MACROBLOCKD *const xd, - FRAME_COUNTS *counts, int segment_id, vp9_reader *r) { - if (vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) { - return vp9_get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME) != - INTRA_FRAME; + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) { + return get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME) != INTRA_FRAME; } else { const int ctx = vp9_get_intra_inter_context(xd); const int is_inter = vp9_read(r, cm->fc->intra_inter_prob[ctx]); - if (!cm->frame_parallel_decoding_mode) + FRAME_COUNTS *counts = xd->counts; + if (counts) ++counts->intra_inter[ctx][is_inter]; return is_inter; } @@ -462,7 +464,6 @@ static void fpm_sync(void *const data, int mi_row) { static void read_inter_block_mode_info(VP9Decoder *const pbi, MACROBLOCKD *const xd, - FRAME_COUNTS *counts, const TileInfo *const tile, MODE_INFO *const mi, int mi_row, int mi_col, vp9_reader *r) { @@ -471,9 +472,11 @@ static void read_inter_block_mode_info(VP9Decoder *const pbi, const BLOCK_SIZE bsize = mbmi->sb_type; const int allow_hp = cm->allow_high_precision_mv; int_mv nearestmv[2], nearmv[2]; - int inter_mode_ctx, ref, is_compound; + int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES]; + int ref, is_compound; + uint8_t inter_mode_ctx[MAX_REF_FRAMES]; - read_ref_frames(cm, xd, counts, r, mbmi->segment_id, mbmi->ref_frame); + read_ref_frames(cm, xd, r, mbmi->segment_id, mbmi->ref_frame); is_compound = has_second_ref(mbmi); for (ref = 0; ref < 1 + is_compound; ++ref) { @@ -485,13 +488,11 @@ static void read_inter_block_mode_info(VP9Decoder *const pbi, "Reference frame has invalid dimensions"); vp9_setup_pre_planes(xd, ref, ref_buf->buf, mi_row, mi_col, &ref_buf->sf); - vp9_find_mv_refs(cm, xd, tile, mi, frame, mbmi->ref_mvs[frame], - mi_row, mi_col, fpm_sync, (void *)pbi); + vp9_find_mv_refs(cm, xd, tile, mi, frame, ref_mvs[frame], + mi_row, mi_col, fpm_sync, (void *)pbi, inter_mode_ctx); } - inter_mode_ctx = mbmi->mode_context[mbmi->ref_frame[0]]; - - if (vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { + if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { mbmi->mode = ZEROMV; if (bsize < BLOCK_8X8) { vpx_internal_error(xd->error_info, VPX_CODEC_UNSUP_BITSTREAM, @@ -500,18 +501,19 @@ static void read_inter_block_mode_info(VP9Decoder *const pbi, } } else { if (bsize >= BLOCK_8X8) - mbmi->mode = read_inter_mode(cm, counts, r, inter_mode_ctx); + mbmi->mode = read_inter_mode(cm, xd, r, + inter_mode_ctx[mbmi->ref_frame[0]]); } if (bsize < BLOCK_8X8 || mbmi->mode != ZEROMV) { for (ref = 0; ref < 1 + is_compound; ++ref) { - vp9_find_best_ref_mvs(xd, allow_hp, mbmi->ref_mvs[mbmi->ref_frame[ref]], + vp9_find_best_ref_mvs(xd, allow_hp, ref_mvs[mbmi->ref_frame[ref]], &nearestmv[ref], &nearmv[ref]); } } mbmi->interp_filter = (cm->interp_filter == SWITCHABLE) - ? read_switchable_interp_filter(cm, xd, counts, r) + ? 
read_switchable_interp_filter(cm, xd, r) : cm->interp_filter; if (bsize < BLOCK_8X8) { @@ -524,15 +526,18 @@ static void read_inter_block_mode_info(VP9Decoder *const pbi, for (idx = 0; idx < 2; idx += num_4x4_w) { int_mv block[2]; const int j = idy * 2 + idx; - b_mode = read_inter_mode(cm, counts, r, inter_mode_ctx); + b_mode = read_inter_mode(cm, xd, r, inter_mode_ctx[mbmi->ref_frame[0]]); - if (b_mode == NEARESTMV || b_mode == NEARMV) + if (b_mode == NEARESTMV || b_mode == NEARMV) { + uint8_t dummy_mode_ctx[MAX_REF_FRAMES]; for (ref = 0; ref < 1 + is_compound; ++ref) vp9_append_sub8x8_mvs_for_idx(cm, xd, tile, j, ref, mi_row, mi_col, &nearest_sub8x8[ref], - &near_sub8x8[ref]); + &near_sub8x8[ref], + dummy_mode_ctx); + } - if (!assign_mv(cm, counts, b_mode, block, nearestmv, + if (!assign_mv(cm, xd, b_mode, block, nearestmv, nearest_sub8x8, near_sub8x8, is_compound, allow_hp, r)) { xd->corrupted |= 1; @@ -555,14 +560,13 @@ static void read_inter_block_mode_info(VP9Decoder *const pbi, mbmi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int; mbmi->mv[1].as_int = mi->bmi[3].as_mv[1].as_int; } else { - xd->corrupted |= !assign_mv(cm, counts, mbmi->mode, mbmi->mv, nearestmv, + xd->corrupted |= !assign_mv(cm, xd, mbmi->mode, mbmi->mv, nearestmv, nearestmv, nearmv, is_compound, allow_hp, r); } } static void read_inter_frame_mode_info(VP9Decoder *const pbi, MACROBLOCKD *const xd, - FRAME_COUNTS *counts, const TileInfo *const tile, int mi_row, int mi_col, vp9_reader *r) { VP9_COMMON *const cm = &pbi->common; @@ -573,18 +577,17 @@ static void read_inter_frame_mode_info(VP9Decoder *const pbi, mbmi->mv[0].as_int = 0; mbmi->mv[1].as_int = 0; mbmi->segment_id = read_inter_segment_id(cm, xd, mi_row, mi_col, r); - mbmi->skip = read_skip(cm, xd, counts, mbmi->segment_id, r); - inter_block = read_is_inter_block(cm, xd, counts, mbmi->segment_id, r); - mbmi->tx_size = read_tx_size(cm, xd, counts, !mbmi->skip || !inter_block, r); + mbmi->skip = read_skip(cm, xd, mbmi->segment_id, r); + inter_block = read_is_inter_block(cm, xd, mbmi->segment_id, r); + mbmi->tx_size = read_tx_size(cm, xd, !mbmi->skip || !inter_block, r); if (inter_block) - read_inter_block_mode_info(pbi, xd, counts, tile, mi, mi_row, mi_col, r); + read_inter_block_mode_info(pbi, xd, tile, mi, mi_row, mi_col, r); else - read_intra_block_mode_info(cm, counts, mi, r); + read_intra_block_mode_info(cm, xd, mi, r); } void vp9_read_mode_info(VP9Decoder *const pbi, MACROBLOCKD *xd, - FRAME_COUNTS *counts, const TileInfo *const tile, int mi_row, int mi_col, vp9_reader *r) { VP9_COMMON *const cm = &pbi->common; @@ -596,19 +599,20 @@ void vp9_read_mode_info(VP9Decoder *const pbi, MACROBLOCKD *xd, MV_REF* frame_mvs = cm->cur_frame->mvs + mi_row * cm->mi_cols + mi_col; int w, h; - if (frame_is_intra_only(cm)) - read_intra_frame_mode_info(cm, xd, counts, mi_row, mi_col, r); - else - read_inter_frame_mode_info(pbi, xd, counts, tile, mi_row, mi_col, r); - - for (h = 0; h < y_mis; ++h) { - MV_REF *const frame_mv = frame_mvs + h * cm->mi_cols; - for (w = 0; w < x_mis; ++w) { - MV_REF *const mv = frame_mv + w; - mv->ref_frame[0] = mi->mbmi.ref_frame[0]; - mv->ref_frame[1] = mi->mbmi.ref_frame[1]; - mv->mv[0].as_int = mi->mbmi.mv[0].as_int; - mv->mv[1].as_int = mi->mbmi.mv[1].as_int; + if (frame_is_intra_only(cm)) { + read_intra_frame_mode_info(cm, xd, mi_row, mi_col, r); + } else { + read_inter_frame_mode_info(pbi, xd, tile, mi_row, mi_col, r); + + for (h = 0; h < y_mis; ++h) { + MV_REF *const frame_mv = frame_mvs + h * cm->mi_cols; + for (w = 0; w < x_mis; ++w) { + 
MV_REF *const mv = frame_mv + w; + mv->ref_frame[0] = mi->mbmi.ref_frame[0]; + mv->ref_frame[1] = mi->mbmi.ref_frame[1]; + mv->mv[0].as_int = mi->mbmi.mv[0].as_int; + mv->mv[1].as_int = mi->mbmi.mv[1].as_int; + } } } } diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodemv.h b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodemv.h index c79dff71888..dd97d8da030 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodemv.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodemv.h @@ -21,7 +21,6 @@ extern "C" { struct TileInfo; void vp9_read_mode_info(VP9Decoder *const pbi, MACROBLOCKD *xd, - FRAME_COUNTS *counts, const struct TileInfo *const tile, int mi_row, int mi_col, vp9_reader *r); diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decoder.c b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decoder.c index cf1f23fb98f..7991a39e610 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decoder.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decoder.c @@ -211,6 +211,9 @@ vpx_codec_err_t vp9_set_reference_dec(VP9_COMMON *cm, // Find an empty frame buffer. const int free_fb = get_free_fb(cm); + if (cm->new_fb_idx == INVALID_IDX) + return VPX_CODEC_MEM_ERROR; + // Decrease ref_count since it will be increased again in // ref_cnt_fb() below. --frame_bufs[free_fb].ref_count; @@ -298,7 +301,10 @@ int vp9_receive_compressed_data(VP9Decoder *pbi, && frame_bufs[cm->new_fb_idx].ref_count == 0) pool->release_fb_cb(pool->cb_priv, &frame_bufs[cm->new_fb_idx].raw_frame_buffer); + // Find a free frame buffer. Return error if can not find any. cm->new_fb_idx = get_free_fb(cm); + if (cm->new_fb_idx == INVALID_IDX) + return VPX_CODEC_MEM_ERROR; // Assign a MV array to the frame buffer. 
cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx]; diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_detokenize.c b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_detokenize.c index bb8c66fc09f..3304e64b2d5 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_detokenize.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_detokenize.c @@ -17,6 +17,7 @@ #if CONFIG_COEFFICIENT_RANGE_CHECKING #include "vp9/common/vp9_idct.h" #endif +#include "vp9/common/vp9_scan.h" #include "vp9/decoder/vp9_detokenize.h" @@ -34,7 +35,7 @@ #define INCREMENT_COUNT(token) \ do { \ - if (!cm->frame_parallel_decoding_mode) \ + if (counts) \ ++coef_counts[band][ctx][token]; \ } while (0) @@ -45,22 +46,21 @@ static INLINE int read_coeff(const vp9_prob *probs, int n, vp9_reader *r) { return val; } -static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, - FRAME_COUNTS *counts, PLANE_TYPE type, +static int decode_coefs(const MACROBLOCKD *xd, + PLANE_TYPE type, tran_low_t *dqcoeff, TX_SIZE tx_size, const int16_t *dq, int ctx, const int16_t *scan, const int16_t *nb, vp9_reader *r) { + FRAME_COUNTS *counts = xd->counts; const int max_eob = 16 << (tx_size << 1); - const FRAME_CONTEXT *const fc = cm->fc; + const FRAME_CONTEXT *const fc = xd->fc; const int ref = is_inter_block(&xd->mi[0]->mbmi); int band, c = 0; const vp9_prob (*coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] = fc->coef_probs[tx_size][type][ref]; const vp9_prob *prob; - unsigned int (*coef_counts)[COEFF_CONTEXTS][UNCONSTRAINED_NODES + 1] = - counts->coef[tx_size][type][ref]; - unsigned int (*eob_branch_count)[COEFF_CONTEXTS] = - counts->eob_branch[tx_size][type][ref]; + unsigned int (*coef_counts)[COEFF_CONTEXTS][UNCONSTRAINED_NODES + 1]; + unsigned int (*eob_branch_count)[COEFF_CONTEXTS]; uint8_t token_cache[32 * 32]; const uint8_t *band_translate = get_band_translate(tx_size); const int dq_shift = (tx_size == TX_32X32); @@ -73,9 +73,14 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, const uint8_t *cat5_prob; const uint8_t *cat6_prob; + if (counts) { + coef_counts = counts->coef[tx_size][type][ref]; + eob_branch_count = counts->eob_branch[tx_size][type][ref]; + } + #if CONFIG_VP9_HIGHBITDEPTH - if (cm->use_highbitdepth) { - if (cm->bit_depth == VPX_BITS_10) { + if (xd->bd > VPX_BITS_8) { + if (xd->bd == VPX_BITS_10) { cat1_prob = vp9_cat1_prob_high10; cat2_prob = vp9_cat2_prob_high10; cat3_prob = vp9_cat3_prob_high10; @@ -111,7 +116,7 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, int val = -1; band = *band_translate++; prob = coef_probs[band][ctx]; - if (!cm->frame_parallel_decoding_mode) + if (counts) ++eob_branch_count[band][ctx]; if (!vp9_read(r, prob[EOB_CONTEXT_NODE])) { INCREMENT_COUNT(EOB_MODEL_TOKEN); @@ -161,7 +166,7 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, break; case CATEGORY6_TOKEN: #if CONFIG_VP9_HIGHBITDEPTH - switch (cm->bit_depth) { + switch (xd->bd) { case VPX_BITS_8: val = CAT6_MIN_VAL + read_coeff(cat6_prob, 14, r); break; @@ -185,7 +190,7 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, #if CONFIG_COEFFICIENT_RANGE_CHECKING #if CONFIG_VP9_HIGHBITDEPTH dqcoeff[scan[c]] = highbd_check_range((vp9_read_bit(r) ? -v : v), - cm->bit_depth); + xd->bd); #else dqcoeff[scan[c]] = check_range(vp9_read_bit(r) ? 
-v : v); #endif // CONFIG_VP9_HIGHBITDEPTH @@ -201,18 +206,17 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, return c; } -int vp9_decode_block_tokens(VP9_COMMON *cm, MACROBLOCKD *xd, - FRAME_COUNTS *counts, int plane, int block, +int vp9_decode_block_tokens(MACROBLOCKD *xd, + int plane, int block, BLOCK_SIZE plane_bsize, int x, int y, TX_SIZE tx_size, vp9_reader *r, int seg_id) { struct macroblockd_plane *const pd = &xd->plane[plane]; - const int16_t *const dequant = (plane == 0) ? cm->y_dequant[seg_id] - : cm->uv_dequant[seg_id]; + const int16_t *const dequant = pd->seg_dequant[seg_id]; const int ctx = get_entropy_context(tx_size, pd->above_context + x, pd->left_context + y); const scan_order *so = get_scan(xd, tx_size, pd->plane_type, block); - const int eob = decode_coefs(cm, xd, counts, pd->plane_type, + const int eob = decode_coefs(xd, pd->plane_type, BLOCK_OFFSET(pd->dqcoeff, block), tx_size, dequant, ctx, so->scan, so->neighbors, r); vp9_set_contexts(xd, pd, plane_bsize, tx_size, eob > 0, x, y); diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_detokenize.h b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_detokenize.h index 86126b6a19e..df176066898 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_detokenize.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_detokenize.h @@ -19,8 +19,8 @@ extern "C" { #endif -int vp9_decode_block_tokens(VP9_COMMON *cm, MACROBLOCKD *xd, - FRAME_COUNTS *counts, int plane, int block, +int vp9_decode_block_tokens(MACROBLOCKD *xd, + int plane, int block, BLOCK_SIZE plane_bsize, int x, int y, TX_SIZE tx_size, vp9_reader *r, int seg_id); diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_variance_neon.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_variance_neon.c index cf82dd75d92..0ac194e92b0 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_variance_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_variance_neon.c @@ -10,91 +10,24 @@ #include <arm_neon.h> #include "./vp9_rtcd.h" +#include "./vpx_dsp_rtcd.h" #include "./vpx_config.h" #include "vpx_ports/mem.h" #include "vpx/vpx_integer.h" -#include "vp9/common/vp9_common.h" #include "vp9/common/vp9_filter.h" -#include "vp9/encoder/vp9_variance.h" - -static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) { - const int32x4_t a = vpaddlq_s16(v_16x8); - const int64x2_t b = vpaddlq_s32(a); - const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), - vreinterpret_s32_s64(vget_high_s64(b))); - return vget_lane_s32(c, 0); -} - -static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) { - const int64x2_t b = vpaddlq_s32(v_32x4); - const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), - vreinterpret_s32_s64(vget_high_s64(b))); - return vget_lane_s32(c, 0); -} - -// w * h must be less than 2048 or local variable v_sum may overflow. 
-static void variance_neon_w8(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - int w, int h, uint32_t *sse, int *sum) { - int i, j; - int16x8_t v_sum = vdupq_n_s16(0); - int32x4_t v_sse_lo = vdupq_n_s32(0); - int32x4_t v_sse_hi = vdupq_n_s32(0); - - for (i = 0; i < h; ++i) { - for (j = 0; j < w; j += 8) { - const uint8x8_t v_a = vld1_u8(&a[j]); - const uint8x8_t v_b = vld1_u8(&b[j]); - const uint16x8_t v_diff = vsubl_u8(v_a, v_b); - const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff); - v_sum = vaddq_s16(v_sum, sv_diff); - v_sse_lo = vmlal_s16(v_sse_lo, - vget_low_s16(sv_diff), - vget_low_s16(sv_diff)); - v_sse_hi = vmlal_s16(v_sse_hi, - vget_high_s16(sv_diff), - vget_high_s16(sv_diff)); - } - a += a_stride; - b += b_stride; - } - - *sum = horizontal_add_s16x8(v_sum); - *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi)); -} - -void vp9_get8x8var_neon(const uint8_t *src_ptr, int source_stride, - const uint8_t *ref_ptr, int ref_stride, - unsigned int *sse, int *sum) { - variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, 8, - 8, sse, sum); -} - -unsigned int vp9_variance8x8_neon(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - unsigned int *sse) { - int sum; - variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &sum); - return *sse - (((int64_t)sum * sum) >> 6); // >> 6 = / 8 * 8 -} - -void vp9_get16x16var_neon(const uint8_t *src_ptr, int source_stride, - const uint8_t *ref_ptr, int ref_stride, - unsigned int *sse, int *sum) { - variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, 16, - 16, sse, sum); -} - -unsigned int vp9_variance16x16_neon(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - unsigned int *sse) { - int sum; - variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, &sum); - return *sse - (((int64_t)sum * sum) >> 8); // >> 8 = / 16 * 16 -} +static uint8_t bilinear_filters[8][2] = { + { 128, 0, }, + { 112, 16, }, + { 96, 32, }, + { 80, 48, }, + { 64, 64, }, + { 48, 80, }, + { 32, 96, }, + { 16, 112, }, +}; static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, uint8_t *output_ptr, @@ -102,9 +35,9 @@ static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, int pixel_step, unsigned int output_height, unsigned int output_width, - const int16_t *vp9_filter) { - const uint8x8_t f0 = vmov_n_u8((uint8_t)vp9_filter[0]); - const uint8x8_t f1 = vmov_n_u8((uint8_t)vp9_filter[1]); + const uint8_t *vp9_filter) { + const uint8x8_t f0 = vmov_n_u8(vp9_filter[0]); + const uint8x8_t f1 = vmov_n_u8(vp9_filter[1]); unsigned int i; for (i = 0; i < output_height; ++i) { const uint8x8_t src_0 = vld1_u8(&src_ptr[0]); @@ -125,9 +58,9 @@ static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, int pixel_step, unsigned int output_height, unsigned int output_width, - const int16_t *vp9_filter) { - const uint8x8_t f0 = vmov_n_u8((uint8_t)vp9_filter[0]); - const uint8x8_t f1 = vmov_n_u8((uint8_t)vp9_filter[1]); + const uint8_t *vp9_filter) { + const uint8x8_t f0 = vmov_n_u8(vp9_filter[0]); + const uint8x8_t f1 = vmov_n_u8(vp9_filter[1]); unsigned int i, j; for (i = 0; i < output_height; ++i) { for (j = 0; j < output_width; j += 16) { @@ -159,10 +92,10 @@ unsigned int vp9_sub_pixel_variance8x8_neon(const uint8_t *src, var_filter_block2d_bil_w8(src, fdata3, src_stride, 1, 9, 8, - BILINEAR_FILTERS_2TAP(xoffset)); + bilinear_filters[xoffset]); var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8, - 8, BILINEAR_FILTERS_2TAP(yoffset)); - return vp9_variance8x8_neon(temp2, 8, dst, dst_stride, sse); + 
8, bilinear_filters[yoffset]); + return vpx_variance8x8_neon(temp2, 8, dst, dst_stride, sse); } unsigned int vp9_sub_pixel_variance16x16_neon(const uint8_t *src, @@ -177,80 +110,10 @@ unsigned int vp9_sub_pixel_variance16x16_neon(const uint8_t *src, var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 17, 16, - BILINEAR_FILTERS_2TAP(xoffset)); + bilinear_filters[xoffset]); var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 16, - 16, BILINEAR_FILTERS_2TAP(yoffset)); - return vp9_variance16x16_neon(temp2, 16, dst, dst_stride, sse); -} - -void vp9_get32x32var_neon(const uint8_t *src_ptr, int source_stride, - const uint8_t *ref_ptr, int ref_stride, - unsigned int *sse, int *sum) { - variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, 32, - 32, sse, sum); -} - -unsigned int vp9_variance32x32_neon(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - unsigned int *sse) { - int sum; - variance_neon_w8(a, a_stride, b, b_stride, 32, 32, sse, &sum); - return *sse - (((int64_t)sum * sum) >> 10); // >> 10 = / 32 * 32 -} - -unsigned int vp9_variance32x64_neon(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - unsigned int *sse) { - int sum1, sum2; - uint32_t sse1, sse2; - variance_neon_w8(a, a_stride, b, b_stride, 32, 32, &sse1, &sum1); - variance_neon_w8(a + (32 * a_stride), a_stride, - b + (32 * b_stride), b_stride, 32, 32, - &sse2, &sum2); - *sse = sse1 + sse2; - sum1 += sum2; - return *sse - (((int64_t)sum1 * sum1) >> 11); // >> 11 = / 32 * 64 -} - -unsigned int vp9_variance64x32_neon(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - unsigned int *sse) { - int sum1, sum2; - uint32_t sse1, sse2; - variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1); - variance_neon_w8(a + (16 * a_stride), a_stride, - b + (16 * b_stride), b_stride, 64, 16, - &sse2, &sum2); - *sse = sse1 + sse2; - sum1 += sum2; - return *sse - (((int64_t)sum1 * sum1) >> 11); // >> 11 = / 32 * 64 -} - -unsigned int vp9_variance64x64_neon(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - unsigned int *sse) { - int sum1, sum2; - uint32_t sse1, sse2; - - variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1); - variance_neon_w8(a + (16 * a_stride), a_stride, - b + (16 * b_stride), b_stride, 64, 16, - &sse2, &sum2); - sse1 += sse2; - sum1 += sum2; - - variance_neon_w8(a + (16 * 2 * a_stride), a_stride, - b + (16 * 2 * b_stride), b_stride, - 64, 16, &sse2, &sum2); - sse1 += sse2; - sum1 += sum2; - - variance_neon_w8(a + (16 * 3 * a_stride), a_stride, - b + (16 * 3 * b_stride), b_stride, - 64, 16, &sse2, &sum2); - *sse = sse1 + sse2; - sum1 += sum2; - return *sse - (((int64_t)sum1 * sum1) >> 12); // >> 12 = / 64 * 64 + 16, bilinear_filters[yoffset]); + return vpx_variance16x16_neon(temp2, 16, dst, dst_stride, sse); } unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src, @@ -265,10 +128,10 @@ unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src, var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 33, 32, - BILINEAR_FILTERS_2TAP(xoffset)); + bilinear_filters[xoffset]); var_filter_block2d_bil_w16(fdata3, temp2, 32, 32, 32, - 32, BILINEAR_FILTERS_2TAP(yoffset)); - return vp9_variance32x32_neon(temp2, 32, dst, dst_stride, sse); + 32, bilinear_filters[yoffset]); + return vpx_variance32x32_neon(temp2, 32, dst, dst_stride, sse); } unsigned int vp9_sub_pixel_variance64x64_neon(const uint8_t *src, @@ -283,8 +146,8 @@ unsigned int vp9_sub_pixel_variance64x64_neon(const uint8_t *src, var_filter_block2d_bil_w16(src, fdata3, 
src_stride, 1, 65, 64, - BILINEAR_FILTERS_2TAP(xoffset)); + bilinear_filters[xoffset]); var_filter_block2d_bil_w16(fdata3, temp2, 64, 64, 64, - 64, BILINEAR_FILTERS_2TAP(yoffset)); - return vp9_variance64x64_neon(temp2, 64, dst, dst_stride, sse); + 64, bilinear_filters[yoffset]); + return vpx_variance64x64_neon(temp2, 64, dst, dst_stride, sse); } diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/mips/msa/vp9_avg_msa.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/mips/msa/vp9_avg_msa.c new file mode 100644 index 00000000000..f2e8b275a6a --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/mips/msa/vp9_avg_msa.c @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp9_rtcd.h" +#include "vp9/common/mips/msa/vp9_macros_msa.h" + +uint32_t vp9_avg_8x8_msa(const uint8_t *src, int32_t src_stride) { + uint32_t sum_out; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v8u16 sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7; + v4u32 sum = { 0 }; + + LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + HADD_UB4_UH(src0, src1, src2, src3, sum0, sum1, sum2, sum3); + HADD_UB4_UH(src4, src5, src6, src7, sum4, sum5, sum6, sum7); + ADD4(sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum0, sum2, sum4, sum6); + ADD2(sum0, sum2, sum4, sum6, sum0, sum4); + sum0 += sum4; + + sum = __msa_hadd_u_w(sum0, sum0); + sum0 = (v8u16)__msa_pckev_h((v8i16)sum, (v8i16)sum); + sum = __msa_hadd_u_w(sum0, sum0); + sum = (v4u32)__msa_srari_w((v4i32)sum, 6); + sum_out = __msa_copy_u_w((v4i32)sum, 0); + + return sum_out; +} + +uint32_t vp9_avg_4x4_msa(const uint8_t *src, int32_t src_stride) { + uint32_t sum_out; + uint32_t src0, src1, src2, src3; + v16u8 vec = { 0 }; + v8u16 sum0; + v4u32 sum1; + v2u64 sum2; + + LW4(src, src_stride, src0, src1, src2, src3); + INSERT_W4_UB(src0, src1, src2, src3, vec); + + sum0 = __msa_hadd_u_h(vec, vec); + sum1 = __msa_hadd_u_w(sum0, sum0); + sum0 = (v8u16)__msa_pckev_h((v8i16)sum1, (v8i16)sum1); + sum1 = __msa_hadd_u_w(sum0, sum0); + sum2 = __msa_hadd_u_d(sum1, sum1); + sum1 = (v4u32)__msa_srari_w((v4i32)sum2, 4); + sum_out = __msa_copy_u_w((v4i32)sum1, 0); + + return sum_out; +} diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/mips/msa/vp9_error_msa.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/mips/msa/vp9_error_msa.c new file mode 100644 index 00000000000..9709092fcce --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/mips/msa/vp9_error_msa.c @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "./vp9_rtcd.h" +#include "vp9/common/mips/msa/vp9_macros_msa.h" + +#define BLOCK_ERROR_BLOCKSIZE_MSA(BSize) \ +static int64_t block_error_##BSize##size_msa(const int16_t *coeff_ptr, \ + const int16_t *dq_coeff_ptr, \ + int64_t *ssz) { \ + int64_t err = 0; \ + uint32_t loop_cnt; \ + v8i16 coeff, dq_coeff, coeff_r_h, coeff_l_h; \ + v4i32 diff_r, diff_l, coeff_r_w, coeff_l_w; \ + v2i64 sq_coeff_r, sq_coeff_l; \ + v2i64 err0, err_dup0, err1, err_dup1; \ + \ + coeff = LD_SH(coeff_ptr); \ + dq_coeff = LD_SH(dq_coeff_ptr); \ + UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w); \ + ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h); \ + HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l); \ + DOTP_SW2_SD(coeff_r_w, coeff_l_w, coeff_r_w, coeff_l_w, \ + sq_coeff_r, sq_coeff_l); \ + DOTP_SW2_SD(diff_r, diff_l, diff_r, diff_l, err0, err1); \ + \ + coeff = LD_SH(coeff_ptr + 8); \ + dq_coeff = LD_SH(dq_coeff_ptr + 8); \ + UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w); \ + ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h); \ + HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l); \ + DPADD_SD2_SD(coeff_r_w, coeff_l_w, sq_coeff_r, sq_coeff_l); \ + DPADD_SD2_SD(diff_r, diff_l, err0, err1); \ + \ + coeff_ptr += 16; \ + dq_coeff_ptr += 16; \ + \ + for (loop_cnt = ((BSize >> 4) - 1); loop_cnt--;) { \ + coeff = LD_SH(coeff_ptr); \ + dq_coeff = LD_SH(dq_coeff_ptr); \ + UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w); \ + ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h); \ + HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l); \ + DPADD_SD2_SD(coeff_r_w, coeff_l_w, sq_coeff_r, sq_coeff_l); \ + DPADD_SD2_SD(diff_r, diff_l, err0, err1); \ + \ + coeff = LD_SH(coeff_ptr + 8); \ + dq_coeff = LD_SH(dq_coeff_ptr + 8); \ + UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w); \ + ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h); \ + HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l); \ + DPADD_SD2_SD(coeff_r_w, coeff_l_w, sq_coeff_r, sq_coeff_l); \ + DPADD_SD2_SD(diff_r, diff_l, err0, err1); \ + \ + coeff_ptr += 16; \ + dq_coeff_ptr += 16; \ + } \ + \ + err_dup0 = __msa_splati_d(sq_coeff_r, 1); \ + err_dup1 = __msa_splati_d(sq_coeff_l, 1); \ + sq_coeff_r += err_dup0; \ + sq_coeff_l += err_dup1; \ + *ssz = __msa_copy_s_d(sq_coeff_r, 0); \ + *ssz += __msa_copy_s_d(sq_coeff_l, 0); \ + \ + err_dup0 = __msa_splati_d(err0, 1); \ + err_dup1 = __msa_splati_d(err1, 1); \ + err0 += err_dup0; \ + err1 += err_dup1; \ + err = __msa_copy_s_d(err0, 0); \ + err += __msa_copy_s_d(err1, 0); \ + \ + return err; \ +} + +BLOCK_ERROR_BLOCKSIZE_MSA(16); +BLOCK_ERROR_BLOCKSIZE_MSA(64); +BLOCK_ERROR_BLOCKSIZE_MSA(256); +BLOCK_ERROR_BLOCKSIZE_MSA(1024); + +int64_t vp9_block_error_msa(const tran_low_t *coeff_ptr, + const tran_low_t *dq_coeff_ptr, + intptr_t blk_size, int64_t *ssz) { + int64_t err; + const int16_t *coeff = (const int16_t *)coeff_ptr; + const int16_t *dq_coeff = (const int16_t *)dq_coeff_ptr; + + switch (blk_size) { + case 16: + err = block_error_16size_msa(coeff, dq_coeff, ssz); + break; + case 64: + err = block_error_64size_msa(coeff, dq_coeff, ssz); + break; + case 256: + err = block_error_256size_msa(coeff, dq_coeff, ssz); + break; + case 1024: + err = block_error_1024size_msa(coeff, dq_coeff, ssz); + break; + default: + err = vp9_block_error_c(coeff_ptr, dq_coeff_ptr, blk_size, ssz); + break; + } + + return err; +} diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/mips/msa/vp9_fdct16x16_msa.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/mips/msa/vp9_fdct16x16_msa.c new file mode 100644 index 00000000000..a3ebfab1f7d 
--- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/mips/msa/vp9_fdct16x16_msa.c @@ -0,0 +1,688 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> + +#include "./vp9_rtcd.h" +#include "vp9/encoder/mips/msa/vp9_fdct_msa.h" + +static void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr, + int32_t src_stride) { + v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 in8, in9, in10, in11, in12, in13, in14, in15; + v8i16 stp21, stp22, stp23, stp24, stp25, stp26, stp30; + v8i16 stp31, stp32, stp33, stp34, stp35, stp36, stp37; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, cnst0, cnst1, cnst4, cnst5; + v8i16 coeff = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, + -cospi_8_64, -cospi_24_64, cospi_12_64, cospi_20_64 }; + v8i16 coeff1 = { cospi_2_64, cospi_30_64, cospi_14_64, cospi_18_64, + cospi_10_64, cospi_22_64, cospi_6_64, cospi_26_64 }; + v8i16 coeff2 = { -cospi_2_64, -cospi_10_64, -cospi_18_64, -cospi_26_64, + 0, 0, 0, 0 }; + + LD_SH16(input, src_stride, + in0, in1, in2, in3, in4, in5, in6, in7, + in8, in9, in10, in11, in12, in13, in14, in15); + SLLI_4V(in0, in1, in2, in3, 2); + SLLI_4V(in4, in5, in6, in7, 2); + SLLI_4V(in8, in9, in10, in11, 2); + SLLI_4V(in12, in13, in14, in15, 2); + ADD4(in0, in15, in1, in14, in2, in13, in3, in12, tmp0, tmp1, tmp2, tmp3); + ADD4(in4, in11, in5, in10, in6, in9, in7, in8, tmp4, tmp5, tmp6, tmp7); + VP9_FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, + tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); + ST_SH8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp_ptr, 32); + SUB4(in0, in15, in1, in14, in2, in13, in3, in12, in15, in14, in13, in12); + SUB4(in4, in11, in5, in10, in6, in9, in7, in8, in11, in10, in9, in8); + + tmp_ptr += 16; + + /* stp 1 */ + ILVL_H2_SH(in10, in13, in11, in12, vec2, vec4); + ILVR_H2_SH(in10, in13, in11, in12, vec3, vec5); + + cnst4 = __msa_splati_h(coeff, 0); + stp25 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst4); + + cnst5 = __msa_splati_h(coeff, 1); + cnst5 = __msa_ilvev_h(cnst5, cnst4); + stp22 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst5); + stp24 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst4); + stp23 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst5); + + /* stp2 */ + BUTTERFLY_4(in8, in9, stp22, stp23, stp30, stp31, stp32, stp33); + BUTTERFLY_4(in15, in14, stp25, stp24, stp37, stp36, stp35, stp34); + ILVL_H2_SH(stp36, stp31, stp35, stp32, vec2, vec4); + ILVR_H2_SH(stp36, stp31, stp35, stp32, vec3, vec5); + SPLATI_H2_SH(coeff, 2, 3, cnst0, cnst1); + cnst0 = __msa_ilvev_h(cnst0, cnst1); + stp26 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst0); + + cnst0 = __msa_splati_h(coeff, 4); + cnst1 = __msa_ilvev_h(cnst1, cnst0); + stp21 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst1); + + BUTTERFLY_4(stp30, stp37, stp26, stp21, in8, in15, in14, in9); + ILVRL_H2_SH(in15, in8, vec1, vec0); + SPLATI_H2_SH(coeff1, 0, 1, cnst0, cnst1); + cnst0 = __msa_ilvev_h(cnst0, cnst1); + + in8 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0); + ST_SH(in8, tmp_ptr); + + cnst0 = __msa_splati_h(coeff2, 0); + cnst0 = __msa_ilvev_h(cnst1, cnst0); + in8 = 
VP9_DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0); + ST_SH(in8, tmp_ptr + 224); + + ILVRL_H2_SH(in14, in9, vec1, vec0); + SPLATI_H2_SH(coeff1, 2, 3, cnst0, cnst1); + cnst1 = __msa_ilvev_h(cnst1, cnst0); + + in8 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1); + ST_SH(in8, tmp_ptr + 128); + + cnst1 = __msa_splati_h(coeff2, 2); + cnst0 = __msa_ilvev_h(cnst0, cnst1); + in8 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0); + ST_SH(in8, tmp_ptr + 96); + + SPLATI_H2_SH(coeff, 2, 5, cnst0, cnst1); + cnst1 = __msa_ilvev_h(cnst1, cnst0); + + stp25 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1); + + cnst1 = __msa_splati_h(coeff, 3); + cnst1 = __msa_ilvev_h(cnst0, cnst1); + stp22 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1); + + /* stp4 */ + ADD2(stp34, stp25, stp33, stp22, in13, in10); + + ILVRL_H2_SH(in13, in10, vec1, vec0); + SPLATI_H2_SH(coeff1, 4, 5, cnst0, cnst1); + cnst0 = __msa_ilvev_h(cnst0, cnst1); + in8 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0); + ST_SH(in8, tmp_ptr + 64); + + cnst0 = __msa_splati_h(coeff2, 1); + cnst0 = __msa_ilvev_h(cnst1, cnst0); + in8 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0); + ST_SH(in8, tmp_ptr + 160); + + SUB2(stp34, stp25, stp33, stp22, in12, in11); + ILVRL_H2_SH(in12, in11, vec1, vec0); + SPLATI_H2_SH(coeff1, 6, 7, cnst0, cnst1); + cnst1 = __msa_ilvev_h(cnst1, cnst0); + + in8 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1); + ST_SH(in8, tmp_ptr + 192); + + cnst1 = __msa_splati_h(coeff2, 3); + cnst0 = __msa_ilvev_h(cnst0, cnst1); + in8 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0); + ST_SH(in8, tmp_ptr + 32); +} + +static void fdct16x8_1d_row(int16_t *input, int16_t *output) { + v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 in8, in9, in10, in11, in12, in13, in14, in15; + + LD_SH8(input, 16, in0, in1, in2, in3, in4, in5, in6, in7); + LD_SH8((input + 8), 16, in8, in9, in10, in11, in12, in13, in14, in15); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, + in8, in9, in10, in11, in12, in13, in14, in15); + ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3); + ADD4(in4, 1, in5, 1, in6, 1, in7, 1, in4, in5, in6, in7); + ADD4(in8, 1, in9, 1, in10, 1, in11, 1, in8, in9, in10, in11); + ADD4(in12, 1, in13, 1, in14, 1, in15, 1, in12, in13, in14, in15); + SRA_4V(in0, in1, in2, in3, 2); + SRA_4V(in4, in5, in6, in7, 2); + SRA_4V(in8, in9, in10, in11, 2); + SRA_4V(in12, in13, in14, in15, 2); + BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, + in12, in13, in14, in15, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, + tmp6, tmp7, in8, in9, in10, in11, in12, in13, in14, in15); + ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, input, 16); + VP9_FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, + tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); + LD_SH8(input, 16, in8, in9, in10, in11, in12, in13, in14, in15); + VP9_FDCT8x16_ODD(in8, in9, in10, in11, in12, in13, in14, in15, + in0, in1, in2, in3, in4, in5, in6, in7); + TRANSPOSE8x8_SH_SH(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, + tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3); + ST_SH8(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, output, 16); + TRANSPOSE8x8_SH_SH(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, + tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7); + ST_SH8(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, output + 8, 16); +} + +void vp9_fdct16x16_msa(const int16_t *input, int16_t *output, + int32_t src_stride) { + 
int32_t i; + DECLARE_ALIGNED(32, int16_t, tmp_buf[16 * 16]); + + /* column transform */ + for (i = 0; i < 2; ++i) { + fdct8x16_1d_column((input + 8 * i), (&tmp_buf[0] + 8 * i), src_stride); + } + + /* row transform */ + for (i = 0; i < 2; ++i) { + fdct16x8_1d_row((&tmp_buf[0] + (128 * i)), (output + (128 * i))); + } +} + +void vp9_fdct16x16_1_msa(const int16_t *input, int16_t *out, int32_t stride) { + out[1] = 0; + + out[0] = VP9_LD_HADD(input, stride); + out[0] += VP9_LD_HADD(input + 8, stride); + out[0] += VP9_LD_HADD(input + 16 * 8, stride); + out[0] += VP9_LD_HADD(input + 16 * 8 + 8, stride); + out[0] >>= 1; +} + +static void fadst16_cols_step1_msa(const int16_t *input, int32_t stride, + const int32_t *const0, int16_t *int_buf) { + v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15; + v8i16 tp0, tp1, tp2, tp3, g0, g1, g2, g3, g8, g9, g10, g11, h0, h1, h2, h3; + v4i32 k0, k1, k2, k3; + + /* load input data */ + r0 = LD_SH(input); + r15 = LD_SH(input + 15 * stride); + r7 = LD_SH(input + 7 * stride); + r8 = LD_SH(input + 8 * stride); + SLLI_4V(r0, r15, r7, r8, 2); + + /* stage 1 */ + LD_SW2(const0, 4, k0, k1); + LD_SW2(const0 + 8, 4, k2, k3); + VP9_MADD_BF(r15, r0, r7, r8, k0, k1, k2, k3, g0, g1, g2, g3); + + r3 = LD_SH(input + 3 * stride); + r4 = LD_SH(input + 4 * stride); + r11 = LD_SH(input + 11 * stride); + r12 = LD_SH(input + 12 * stride); + SLLI_4V(r3, r4, r11, r12, 2); + + LD_SW2(const0 + 4 * 4, 4, k0, k1); + LD_SW2(const0 + 4 * 6, 4, k2, k3); + VP9_MADD_BF(r11, r4, r3, r12, k0, k1, k2, k3, g8, g9, g10, g11); + + /* stage 2 */ + BUTTERFLY_4(g0, g2, g10, g8, tp0, tp2, tp3, tp1); + ST_SH2(tp0, tp2, int_buf, 8); + ST_SH2(tp1, tp3, int_buf + 4 * 8, 8); + + LD_SW2(const0 + 4 * 8, 4, k0, k1); + k2 = LD_SW(const0 + 4 * 10); + VP9_MADD_BF(g1, g3, g9, g11, k0, k1, k2, k0, h0, h1, h2, h3); + + ST_SH2(h0, h1, int_buf + 8 * 8, 8); + ST_SH2(h3, h2, int_buf + 12 * 8, 8); + + r9 = LD_SH(input + 9 * stride); + r6 = LD_SH(input + 6 * stride); + r1 = LD_SH(input + stride); + r14 = LD_SH(input + 14 * stride); + SLLI_4V(r9, r6, r1, r14, 2); + + LD_SW2(const0 + 4 * 11, 4, k0, k1); + LD_SW2(const0 + 4 * 13, 4, k2, k3); + VP9_MADD_BF(r9, r6, r1, r14, k0, k1, k2, k3, g0, g1, g2, g3); + + ST_SH2(g1, g3, int_buf + 3 * 8, 4 * 8); + + r13 = LD_SH(input + 13 * stride); + r2 = LD_SH(input + 2 * stride); + r5 = LD_SH(input + 5 * stride); + r10 = LD_SH(input + 10 * stride); + SLLI_4V(r13, r2, r5, r10, 2); + + LD_SW2(const0 + 4 * 15, 4, k0, k1); + LD_SW2(const0 + 4 * 17, 4, k2, k3); + VP9_MADD_BF(r13, r2, r5, r10, k0, k1, k2, k3, h0, h1, h2, h3); + + ST_SH2(h1, h3, int_buf + 11 * 8, 4 * 8); + + BUTTERFLY_4(h0, h2, g2, g0, tp0, tp1, tp2, tp3); + ST_SH4(tp0, tp1, tp2, tp3, int_buf + 2 * 8, 4 * 8); +} + +static void fadst16_cols_step2_msa(int16_t *int_buf, const int32_t *const0, + int16_t *out) { + int16_t *out_ptr = out + 128; + v8i16 tp0, tp1, tp2, tp3, g5, g7, g13, g15; + v8i16 h0, h1, h2, h3, h4, h5, h6, h7, h10, h11; + v8i16 out0, out1, out2, out3, out4, out5, out6, out7; + v8i16 out8, out9, out10, out11, out12, out13, out14, out15; + v4i32 k0, k1, k2, k3; + + LD_SH2(int_buf + 3 * 8, 4 * 8, g13, g15); + LD_SH2(int_buf + 11 * 8, 4 * 8, g5, g7); + LD_SW2(const0 + 4 * 19, 4, k0, k1); + k2 = LD_SW(const0 + 4 * 21); + VP9_MADD_BF(g7, g5, g15, g13, k0, k1, k2, k0, h4, h5, h6, h7); + + tp0 = LD_SH(int_buf + 4 * 8); + tp1 = LD_SH(int_buf + 5 * 8); + tp3 = LD_SH(int_buf + 10 * 8); + tp2 = LD_SH(int_buf + 14 * 8); + LD_SW2(const0 + 4 * 22, 4, k0, k1); + k2 = LD_SW(const0 + 4 * 24); + 
VP9_MADD_BF(tp0, tp1, tp2, tp3, k0, k1, k2, k0, out4, out6, out5, out7); + out4 = -out4; + ST_SH(out4, (out + 3 * 16)); + ST_SH(out5, (out_ptr + 4 * 16)); + + h1 = LD_SH(int_buf + 9 * 8); + h3 = LD_SH(int_buf + 12 * 8); + VP9_MADD_BF(h1, h3, h5, h7, k0, k1, k2, k0, out12, out14, out13, out15); + out13 = -out13; + ST_SH(out12, (out + 2 * 16)); + ST_SH(out13, (out_ptr + 5 * 16)); + + tp0 = LD_SH(int_buf); + tp1 = LD_SH(int_buf + 8); + tp2 = LD_SH(int_buf + 2 * 8); + tp3 = LD_SH(int_buf + 6 * 8); + + BUTTERFLY_4(tp0, tp1, tp3, tp2, out0, out1, h11, h10); + out1 = -out1; + ST_SH(out0, (out)); + ST_SH(out1, (out_ptr + 7 * 16)); + + h0 = LD_SH(int_buf + 8 * 8); + h2 = LD_SH(int_buf + 13 * 8); + + BUTTERFLY_4(h0, h2, h6, h4, out8, out9, out11, out10); + out8 = -out8; + ST_SH(out8, (out + 16)); + ST_SH(out9, (out_ptr + 6 * 16)); + + /* stage 4 */ + LD_SW2(const0 + 4 * 25, 4, k0, k1); + LD_SW2(const0 + 4 * 27, 4, k2, k3); + VP9_MADD_SHORT(h10, h11, k1, k2, out2, out3); + ST_SH(out2, (out + 7 * 16)); + ST_SH(out3, (out_ptr)); + + VP9_MADD_SHORT(out6, out7, k0, k3, out6, out7); + ST_SH(out6, (out + 4 * 16)); + ST_SH(out7, (out_ptr + 3 * 16)); + + VP9_MADD_SHORT(out10, out11, k0, k3, out10, out11); + ST_SH(out10, (out + 6 * 16)); + ST_SH(out11, (out_ptr + 16)); + + VP9_MADD_SHORT(out14, out15, k1, k2, out14, out15); + ST_SH(out14, (out + 5 * 16)); + ST_SH(out15, (out_ptr + 2 * 16)); +} + +static void fadst16_transpose_postproc_msa(int16_t *input, int16_t *out) { + v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15; + v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15; + + /* load input data */ + LD_SH8(input, 16, l0, l1, l2, l3, l4, l5, l6, l7); + TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, + r0, r1, r2, r3, r4, r5, r6, r7); + VP9_FDCT_POSTPROC_2V_NEG_H(r0, r1); + VP9_FDCT_POSTPROC_2V_NEG_H(r2, r3); + VP9_FDCT_POSTPROC_2V_NEG_H(r4, r5); + VP9_FDCT_POSTPROC_2V_NEG_H(r6, r7); + ST_SH8(r0, r1, r2, r3, r4, r5, r6, r7, out, 8); + out += 64; + + LD_SH8(input + 8, 16, l8, l9, l10, l11, l12, l13, l14, l15); + TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, + r8, r9, r10, r11, r12, r13, r14, r15); + VP9_FDCT_POSTPROC_2V_NEG_H(r8, r9); + VP9_FDCT_POSTPROC_2V_NEG_H(r10, r11); + VP9_FDCT_POSTPROC_2V_NEG_H(r12, r13); + VP9_FDCT_POSTPROC_2V_NEG_H(r14, r15); + ST_SH8(r8, r9, r10, r11, r12, r13, r14, r15, out, 8); + out += 64; + + /* load input data */ + input += 128; + LD_SH8(input, 16, l0, l1, l2, l3, l4, l5, l6, l7); + TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, + r0, r1, r2, r3, r4, r5, r6, r7); + VP9_FDCT_POSTPROC_2V_NEG_H(r0, r1); + VP9_FDCT_POSTPROC_2V_NEG_H(r2, r3); + VP9_FDCT_POSTPROC_2V_NEG_H(r4, r5); + VP9_FDCT_POSTPROC_2V_NEG_H(r6, r7); + ST_SH8(r0, r1, r2, r3, r4, r5, r6, r7, out, 8); + out += 64; + + LD_SH8(input + 8, 16, l8, l9, l10, l11, l12, l13, l14, l15); + TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, + r8, r9, r10, r11, r12, r13, r14, r15); + VP9_FDCT_POSTPROC_2V_NEG_H(r8, r9); + VP9_FDCT_POSTPROC_2V_NEG_H(r10, r11); + VP9_FDCT_POSTPROC_2V_NEG_H(r12, r13); + VP9_FDCT_POSTPROC_2V_NEG_H(r14, r15); + ST_SH8(r8, r9, r10, r11, r12, r13, r14, r15, out, 8); +} + +static void fadst16_rows_step1_msa(int16_t *input, const int32_t *const0, + int16_t *int_buf) { + v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15; + v8i16 tp0, tp1, tp2, tp3, g0, g1, g2, g3, g8, g9, g10, g11, h0, h1, h2, h3; + v4i32 k0, k1, k2, k3; + + /* load input data */ + r0 = LD_SH(input); + r7 = LD_SH(input + 7 * 8); + r8 = 
LD_SH(input + 8 * 8); + r15 = LD_SH(input + 15 * 8); + + /* stage 1 */ + LD_SW2(const0, 4, k0, k1); + LD_SW2(const0 + 4 * 2, 4, k2, k3); + VP9_MADD_BF(r15, r0, r7, r8, k0, k1, k2, k3, g0, g1, g2, g3); + + r3 = LD_SH(input + 3 * 8); + r4 = LD_SH(input + 4 * 8); + r11 = LD_SH(input + 11 * 8); + r12 = LD_SH(input + 12 * 8); + + LD_SW2(const0 + 4 * 4, 4, k0, k1); + LD_SW2(const0 + 4 * 6, 4, k2, k3); + VP9_MADD_BF(r11, r4, r3, r12, k0, k1, k2, k3, g8, g9, g10, g11); + + /* stage 2 */ + BUTTERFLY_4(g0, g2, g10, g8, tp0, tp2, tp3, tp1); + ST_SH2(tp0, tp1, int_buf, 4 * 8); + ST_SH2(tp2, tp3, int_buf + 8, 4 * 8); + + LD_SW2(const0 + 4 * 8, 4, k0, k1); + k2 = LD_SW(const0 + 4 * 10); + VP9_MADD_BF(g1, g3, g9, g11, k0, k1, k2, k0, h0, h1, h2, h3); + ST_SH2(h0, h3, int_buf + 8 * 8, 4 * 8); + ST_SH2(h1, h2, int_buf + 9 * 8, 4 * 8); + + r1 = LD_SH(input + 8); + r6 = LD_SH(input + 6 * 8); + r9 = LD_SH(input + 9 * 8); + r14 = LD_SH(input + 14 * 8); + + LD_SW2(const0 + 4 * 11, 4, k0, k1); + LD_SW2(const0 + 4 * 13, 4, k2, k3); + VP9_MADD_BF(r9, r6, r1, r14, k0, k1, k2, k3, g0, g1, g2, g3); + ST_SH2(g1, g3, int_buf + 3 * 8, 4 * 8); + + r2 = LD_SH(input + 2 * 8); + r5 = LD_SH(input + 5 * 8); + r10 = LD_SH(input + 10 * 8); + r13 = LD_SH(input + 13 * 8); + + LD_SW2(const0 + 4 * 15, 4, k0, k1); + LD_SW2(const0 + 4 * 17, 4, k2, k3); + VP9_MADD_BF(r13, r2, r5, r10, k0, k1, k2, k3, h0, h1, h2, h3); + ST_SH2(h1, h3, int_buf + 11 * 8, 4 * 8); + BUTTERFLY_4(h0, h2, g2, g0, tp0, tp1, tp2, tp3); + ST_SH4(tp0, tp1, tp2, tp3, int_buf + 2 * 8, 4 * 8); +} + +static void fadst16_rows_step2_msa(int16_t *int_buf, const int32_t *const0, + int16_t *out) { + int16_t *out_ptr = out + 8; + v8i16 tp0, tp1, tp2, tp3, g5, g7, g13, g15; + v8i16 h0, h1, h2, h3, h4, h5, h6, h7, h10, h11; + v8i16 out0, out1, out2, out3, out4, out5, out6, out7; + v8i16 out8, out9, out10, out11, out12, out13, out14, out15; + v4i32 k0, k1, k2, k3; + + g13 = LD_SH(int_buf + 3 * 8); + g15 = LD_SH(int_buf + 7 * 8); + g5 = LD_SH(int_buf + 11 * 8); + g7 = LD_SH(int_buf + 15 * 8); + + LD_SW2(const0 + 4 * 19, 4, k0, k1); + k2 = LD_SW(const0 + 4 * 21); + VP9_MADD_BF(g7, g5, g15, g13, k0, k1, k2, k0, h4, h5, h6, h7); + + tp0 = LD_SH(int_buf + 4 * 8); + tp1 = LD_SH(int_buf + 5 * 8); + tp3 = LD_SH(int_buf + 10 * 8); + tp2 = LD_SH(int_buf + 14 * 8); + + LD_SW2(const0 + 4 * 22, 4, k0, k1); + k2 = LD_SW(const0 + 4 * 24); + VP9_MADD_BF(tp0, tp1, tp2, tp3, k0, k1, k2, k0, out4, out6, out5, out7); + out4 = -out4; + ST_SH(out4, (out + 3 * 16)); + ST_SH(out5, (out_ptr + 4 * 16)); + + h1 = LD_SH(int_buf + 9 * 8); + h3 = LD_SH(int_buf + 12 * 8); + VP9_MADD_BF(h1, h3, h5, h7, k0, k1, k2, k0, out12, out14, out13, out15); + out13 = -out13; + ST_SH(out12, (out + 2 * 16)); + ST_SH(out13, (out_ptr + 5 * 16)); + + tp0 = LD_SH(int_buf); + tp1 = LD_SH(int_buf + 8); + tp2 = LD_SH(int_buf + 2 * 8); + tp3 = LD_SH(int_buf + 6 * 8); + + BUTTERFLY_4(tp0, tp1, tp3, tp2, out0, out1, h11, h10); + out1 = -out1; + ST_SH(out0, (out)); + ST_SH(out1, (out_ptr + 7 * 16)); + + h0 = LD_SH(int_buf + 8 * 8); + h2 = LD_SH(int_buf + 13 * 8); + BUTTERFLY_4(h0, h2, h6, h4, out8, out9, out11, out10); + out8 = -out8; + ST_SH(out8, (out + 16)); + ST_SH(out9, (out_ptr + 6 * 16)); + + /* stage 4 */ + LD_SW2(const0 + 4 * 25, 4, k0, k1); + LD_SW2(const0 + 4 * 27, 4, k2, k3); + VP9_MADD_SHORT(h10, h11, k1, k2, out2, out3); + ST_SH(out2, (out + 7 * 16)); + ST_SH(out3, (out_ptr)); + + VP9_MADD_SHORT(out6, out7, k0, k3, out6, out7); + ST_SH(out6, (out + 4 * 16)); + ST_SH(out7, (out_ptr + 3 * 16)); + + 
VP9_MADD_SHORT(out10, out11, k0, k3, out10, out11); + ST_SH(out10, (out + 6 * 16)); + ST_SH(out11, (out_ptr + 16)); + + VP9_MADD_SHORT(out14, out15, k1, k2, out14, out15); + ST_SH(out14, (out + 5 * 16)); + ST_SH(out15, (out_ptr + 2 * 16)); +} + +static void fadst16_transpose_msa(int16_t *input, int16_t *out) { + v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15; + v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15; + + /* load input data */ + LD_SH16(input, 8, l0, l8, l1, l9, l2, l10, l3, l11, + l4, l12, l5, l13, l6, l14, l7, l15); + TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, + r0, r1, r2, r3, r4, r5, r6, r7); + TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, + r8, r9, r10, r11, r12, r13, r14, r15); + ST_SH8(r0, r8, r1, r9, r2, r10, r3, r11, out, 8); + ST_SH8(r4, r12, r5, r13, r6, r14, r7, r15, (out + 64), 8); + out += 16 * 8; + + /* load input data */ + input += 128; + LD_SH16(input, 8, l0, l8, l1, l9, l2, l10, l3, l11, + l4, l12, l5, l13, l6, l14, l7, l15); + TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, + r0, r1, r2, r3, r4, r5, r6, r7); + TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, + r8, r9, r10, r11, r12, r13, r14, r15); + ST_SH8(r0, r8, r1, r9, r2, r10, r3, r11, out, 8); + ST_SH8(r4, r12, r5, r13, r6, r14, r7, r15, (out + 64), 8); +} + +static void postproc_fdct16x8_1d_row(int16_t *intermediate, int16_t *output) { + int16_t *temp = intermediate; + int16_t *out = output; + v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11; + v8i16 in12, in13, in14, in15; + + LD_SH8(temp, 16, in0, in1, in2, in3, in4, in5, in6, in7); + temp = intermediate + 8; + LD_SH8(temp, 16, in8, in9, in10, in11, in12, in13, in14, in15); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, + in8, in9, in10, in11, in12, in13, in14, in15); + VP9_FDCT_POSTPROC_2V_NEG_H(in0, in1); + VP9_FDCT_POSTPROC_2V_NEG_H(in2, in3); + VP9_FDCT_POSTPROC_2V_NEG_H(in4, in5); + VP9_FDCT_POSTPROC_2V_NEG_H(in6, in7); + VP9_FDCT_POSTPROC_2V_NEG_H(in8, in9); + VP9_FDCT_POSTPROC_2V_NEG_H(in10, in11); + VP9_FDCT_POSTPROC_2V_NEG_H(in12, in13); + VP9_FDCT_POSTPROC_2V_NEG_H(in14, in15); + BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, + in8, in9, in10, in11, in12, in13, in14, in15, + tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, + in8, in9, in10, in11, in12, in13, in14, in15); + temp = intermediate; + ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, temp, 16); + VP9_FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, + tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); + temp = intermediate; + LD_SH8(temp, 16, in8, in9, in10, in11, in12, in13, in14, in15); + VP9_FDCT8x16_ODD(in8, in9, in10, in11, in12, in13, in14, in15, + in0, in1, in2, in3, in4, in5, in6, in7); + TRANSPOSE8x8_SH_SH(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, + tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3); + ST_SH8(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, out, 16); + TRANSPOSE8x8_SH_SH(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, + tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7); + out = output + 8; + ST_SH8(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, out, 16); +} + +void vp9_fht16x16_msa(const int16_t *input, int16_t *output, + int32_t stride, int32_t tx_type) { + DECLARE_ALIGNED(32, int16_t, tmp[256]); + DECLARE_ALIGNED(32, int16_t, trans_buf[256]); + DECLARE_ALIGNED(32, int16_t, 
tmp_buf[128]); + int32_t i; + int16_t *ptmpbuf = &tmp_buf[0]; + int16_t *trans = &trans_buf[0]; + const int32_t const_arr[29 * 4] = { + 52707308, 52707308, 52707308, 52707308, + -1072430300, -1072430300, -1072430300, -1072430300, + 795618043, 795618043, 795618043, 795618043, + -721080468, -721080468, -721080468, -721080468, + 459094491, 459094491, 459094491, 459094491, + -970646691, -970646691, -970646691, -970646691, + 1010963856, 1010963856, 1010963856, 1010963856, + -361743294, -361743294, -361743294, -361743294, + 209469125, 209469125, 209469125, 209469125, + -1053094788, -1053094788, -1053094788, -1053094788, + 1053160324, 1053160324, 1053160324, 1053160324, + 639644520, 639644520, 639644520, 639644520, + -862444000, -862444000, -862444000, -862444000, + 1062144356, 1062144356, 1062144356, 1062144356, + -157532337, -157532337, -157532337, -157532337, + 260914709, 260914709, 260914709, 260914709, + -1041559667, -1041559667, -1041559667, -1041559667, + 920985831, 920985831, 920985831, 920985831, + -551995675, -551995675, -551995675, -551995675, + 596522295, 596522295, 596522295, 596522295, + 892853362, 892853362, 892853362, 892853362, + -892787826, -892787826, -892787826, -892787826, + 410925857, 410925857, 410925857, 410925857, + -992012162, -992012162, -992012162, -992012162, + 992077698, 992077698, 992077698, 992077698, + 759246145, 759246145, 759246145, 759246145, + -759180609, -759180609, -759180609, -759180609, + -759222975, -759222975, -759222975, -759222975, + 759288511, 759288511, 759288511, 759288511 }; + + switch (tx_type) { + case DCT_DCT: + /* column transform */ + for (i = 0; i < 2; ++i) { + fdct8x16_1d_column(input + 8 * i, tmp + 8 * i, stride); + } + + /* row transform */ + for (i = 0; i < 2; ++i) { + fdct16x8_1d_row(tmp + (128 * i), output + (128 * i)); + } + break; + case ADST_DCT: + /* column transform */ + for (i = 0; i < 2; ++i) { + fadst16_cols_step1_msa(input + (i << 3), stride, const_arr, ptmpbuf); + fadst16_cols_step2_msa(ptmpbuf, const_arr, tmp + (i << 3)); + } + + /* row transform */ + for (i = 0; i < 2; ++i) { + postproc_fdct16x8_1d_row(tmp + (128 * i), output + (128 * i)); + } + break; + case DCT_ADST: + /* column transform */ + for (i = 0; i < 2; ++i) { + fdct8x16_1d_column(input + 8 * i, tmp + 8 * i, stride); + } + + fadst16_transpose_postproc_msa(tmp, trans); + + /* row transform */ + for (i = 0; i < 2; ++i) { + fadst16_rows_step1_msa(trans + (i << 7), const_arr, ptmpbuf); + fadst16_rows_step2_msa(ptmpbuf, const_arr, tmp + (i << 7)); + } + + fadst16_transpose_msa(tmp, output); + break; + case ADST_ADST: + /* column transform */ + for (i = 0; i < 2; ++i) { + fadst16_cols_step1_msa(input + (i << 3), stride, const_arr, ptmpbuf); + fadst16_cols_step2_msa(ptmpbuf, const_arr, tmp + (i << 3)); + } + + fadst16_transpose_postproc_msa(tmp, trans); + + /* row transform */ + for (i = 0; i < 2; ++i) { + fadst16_rows_step1_msa(trans + (i << 7), const_arr, ptmpbuf); + fadst16_rows_step2_msa(ptmpbuf, const_arr, tmp + (i << 7)); + } + + fadst16_transpose_msa(tmp, output); + break; + default: + assert(0); + break; + } +} diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/mips/msa/vp9_fdct32x32_msa.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/mips/msa/vp9_fdct32x32_msa.c new file mode 100644 index 00000000000..3a740232228 --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/mips/msa/vp9_fdct32x32_msa.c @@ -0,0 +1,956 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp9_rtcd.h" +#include "vp9/encoder/mips/msa/vp9_fdct_msa.h" + +static void fdct8x32_1d_column_load_butterfly(const int16_t *input, + int32_t src_stride, + int16_t *temp_buff) { + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 step0, step1, step2, step3; + v8i16 in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1; + v8i16 step0_1, step1_1, step2_1, step3_1; + + /* 1st and 2nd set */ + LD_SH4(input, src_stride, in0, in1, in2, in3); + LD_SH4(input + (28 * src_stride), src_stride, in4, in5, in6, in7); + LD_SH4(input + (4 * src_stride), src_stride, in0_1, in1_1, in2_1, in3_1); + LD_SH4(input + (24 * src_stride), src_stride, in4_1, in5_1, in6_1, in7_1); + SLLI_4V(in0, in1, in2, in3, 2); + SLLI_4V(in4, in5, in6, in7, 2); + SLLI_4V(in0_1, in1_1, in2_1, in3_1, 2); + SLLI_4V(in4_1, in5_1, in6_1, in7_1, 2); + BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, + step0, step1, step2, step3, in4, in5, in6, in7); + BUTTERFLY_8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, + step0_1, step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, in7_1); + ST_SH4(step0, step1, step2, step3, temp_buff, 8); + ST_SH4(in4, in5, in6, in7, temp_buff + (28 * 8), 8); + ST_SH4(step0_1, step1_1, step2_1, step3_1, temp_buff + (4 * 8), 8); + ST_SH4(in4_1, in5_1, in6_1, in7_1, temp_buff + (24 * 8), 8); + + /* 3rd and 4th set */ + LD_SH4(input + (8 * src_stride), src_stride, in0, in1, in2, in3); + LD_SH4(input + (20 * src_stride), src_stride, in4, in5, in6, in7); + LD_SH4(input + (12 * src_stride), src_stride, in0_1, in1_1, in2_1, in3_1); + LD_SH4(input + (16 * src_stride), src_stride, in4_1, in5_1, in6_1, in7_1); + SLLI_4V(in0, in1, in2, in3, 2); + SLLI_4V(in4, in5, in6, in7, 2); + SLLI_4V(in0_1, in1_1, in2_1, in3_1, 2); + SLLI_4V(in4_1, in5_1, in6_1, in7_1, 2); + BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, + step0, step1, step2, step3, in4, in5, in6, in7); + BUTTERFLY_8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, + step0_1, step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, in7_1); + ST_SH4(step0, step1, step2, step3, temp_buff + (8 * 8), 8); + ST_SH4(in4, in5, in6, in7, temp_buff + (20 * 8), 8); + ST_SH4(step0_1, step1_1, step2_1, step3_1, temp_buff + (12 * 8), 8); + ST_SH4(in4_1, in5_1, in6_1, in7_1, temp_buff + (15 * 8) + 8, 8); +} + +static void fdct8x32_1d_column_even_store(int16_t *input, int16_t *temp) { + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 in8, in9, in10, in11, in12, in13, in14, in15; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8i16 temp0, temp1; + + /* fdct even */ + LD_SH4(input, 8, in0, in1, in2, in3); + LD_SH4(input + 96, 8, in12, in13, in14, in15); + BUTTERFLY_8(in0, in1, in2, in3, in12, in13, in14, in15, + vec0, vec1, vec2, vec3, in12, in13, in14, in15); + LD_SH4(input + 32, 8, in4, in5, in6, in7); + LD_SH4(input + 64, 8, in8, in9, in10, in11); + BUTTERFLY_8(in4, in5, in6, in7, in8, in9, in10, in11, + vec4, vec5, vec6, vec7, in8, in9, in10, in11); + + /* Stage 3 */ + ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3); + BUTTERFLY_4(in0, in1, in2, in3, temp0, in4, in1, in0); + VP9_DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0); + 
VP9_FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + ST_SH(temp0, temp); + ST_SH(temp1, temp + 512); + + VP9_DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0); + VP9_FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + ST_SH(temp0, temp + 256); + ST_SH(temp1, temp + 768); + + SUB4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, vec7, vec6, vec5, vec4); + VP9_DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6); + ADD2(vec4, vec5, vec7, vec6, vec0, vec1); + VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0); + VP9_FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + ST_SH(temp0, temp + 128); + ST_SH(temp1, temp + 896); + + SUB2(vec4, vec5, vec7, vec6, vec4, vec7); + VP9_DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0); + VP9_FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + ST_SH(temp0, temp + 640); + ST_SH(temp1, temp + 384); + + VP9_DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); + VP9_DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); + ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2); + VP9_DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); + ADD2(in0, in1, in2, in3, vec0, vec7); + VP9_DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0); + VP9_FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + ST_SH(temp0, temp + 64); + ST_SH(temp1, temp + 960); + + SUB2(in0, in1, in2, in3, in0, in2); + VP9_DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0); + VP9_FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + ST_SH(temp0, temp + 576); + ST_SH(temp1, temp + 448); + + SUB2(in9, vec2, in14, vec5, vec2, vec5); + VP9_DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1); + SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5); + VP9_DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0); + VP9_FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + ST_SH(temp0, temp + 320); + ST_SH(temp1, temp + 704); + + ADD2(in3, in2, in0, in1, vec3, vec4); + VP9_DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1); + VP9_FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + ST_SH(temp0, temp + 192); + ST_SH(temp1, temp + 832); +} + +static void fdct8x32_1d_column_odd_store(int16_t *input, int16_t *temp_ptr) { + v8i16 in16, in17, in18, in19, in20, in21, in22, in23; + v8i16 in24, in25, in26, in27, in28, in29, in30, in31, vec4, vec5; + + in20 = LD_SH(input + 32); + in21 = LD_SH(input + 40); + in26 = LD_SH(input + 80); + in27 = LD_SH(input + 88); + + VP9_DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27); + VP9_DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26); + + in18 = LD_SH(input + 16); + in19 = LD_SH(input + 24); + in28 = LD_SH(input + 96); + in29 = LD_SH(input + 104); + + vec4 = in19 - in20; + ST_SH(vec4, input + 32); + vec4 = in18 - in21; + ST_SH(vec4, input + 40); + vec4 = in29 - in26; + ST_SH(vec4, input + 80); + vec4 = in28 - in27; + ST_SH(vec4, input + 88); + + in21 = in18 + in21; + in20 = in19 + in20; + in27 = in28 + in27; + in26 = in29 + in26; + + LD_SH4(input + 48, 8, in22, in23, in24, in25); + VP9_DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25); + VP9_DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24); + + in16 = LD_SH(input); + in17 = LD_SH(input + 8); + in30 = LD_SH(input + 112); + in31 = LD_SH(input + 120); + + vec4 = in17 - in22; + ST_SH(vec4, input + 16); + vec4 = in16 - in23; + ST_SH(vec4, input + 24); + vec4 = in31 - in24; + ST_SH(vec4, input + 96); + vec4 = in30 - in25; + ST_SH(vec4, input + 
104); + + ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31); + VP9_DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29); + VP9_DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28); + ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25); + VP9_DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24); + ADD2(in27, in26, in25, in24, in23, in20); + VP9_DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5); + VP9_FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + ST_SH(vec5, temp_ptr); + ST_SH(vec4, temp_ptr + 960); + + SUB2(in27, in26, in25, in24, in22, in21); + VP9_DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4); + VP9_FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + ST_SH(vec5, temp_ptr + 448); + ST_SH(vec4, temp_ptr + 512); + + SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20); + VP9_DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25); + SUB2(in26, in27, in24, in25, in23, in20); + VP9_DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5); + VP9_FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + ST_SH(vec4, temp_ptr + 704); + ST_SH(vec5, temp_ptr + 256); + + ADD2(in26, in27, in24, in25, in22, in21); + VP9_DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5); + VP9_FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + ST_SH(vec4, temp_ptr + 192); + ST_SH(vec5, temp_ptr + 768); + + LD_SH4(input + 16, 8, in22, in23, in20, in21); + LD_SH4(input + 80, 8, in26, in27, in24, in25); + in16 = in20; + in17 = in21; + VP9_DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27); + VP9_DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26); + SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31); + VP9_DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30); + ADD2(in28, in29, in31, in30, in16, in19); + VP9_DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4); + VP9_FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + ST_SH(vec5, temp_ptr + 832); + ST_SH(vec4, temp_ptr + 128); + + SUB2(in28, in29, in31, in30, in17, in18); + VP9_DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4); + VP9_FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + ST_SH(vec5, temp_ptr + 320); + ST_SH(vec4, temp_ptr + 640); + ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19); + VP9_DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31); + SUB2(in29, in28, in30, in31, in16, in19); + VP9_DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4); + VP9_FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + ST_SH(vec5, temp_ptr + 576); + ST_SH(vec4, temp_ptr + 384); + + ADD2(in29, in28, in30, in31, in17, in18); + VP9_DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4); + VP9_FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + ST_SH(vec5, temp_ptr + 64); + ST_SH(vec4, temp_ptr + 896); +} + +static void fdct8x32_1d_column(const int16_t *input, int32_t src_stride, + int16_t *tmp_buf, int16_t *tmp_buf_big) { + fdct8x32_1d_column_load_butterfly(input, src_stride, tmp_buf); + fdct8x32_1d_column_even_store(tmp_buf, tmp_buf_big); + fdct8x32_1d_column_odd_store(tmp_buf + 128, (tmp_buf_big + 32)); +} + +static void fdct8x32_1d_row_load_butterfly(int16_t *temp_buff, + int16_t *output) { + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 in8, in9, in10, in11, in12, in13, in14, in15; + v8i16 step0, step1, step2, step3, step4, step5, step6, step7; + + LD_SH8(temp_buff, 32, in0, in1, in2, in3, in4, in5, in6, in7); + 
LD_SH8(temp_buff + 24, 32, in8, in9, in10, in11, in12, in13, in14, in15); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, + in8, in9, in10, in11, in12, in13, in14, in15); + BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, + in8, in9, in10, in11, in12, in13, in14, in15, + step0, step1, step2, step3, step4, step5, step6, step7, + in8, in9, in10, in11, in12, in13, in14, in15); + ST_SH8(step0, step1, step2, step3, step4, step5, step6, step7, output, 8); + ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, (output + 24 * 8), 8); + + /* 2nd set */ + LD_SH8(temp_buff + 8, 32, in0, in1, in2, in3, in4, in5, in6, in7); + LD_SH8(temp_buff + 16, 32, in8, in9, in10, in11, in12, in13, in14, in15); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, + in8, in9, in10, in11, in12, in13, in14, in15); + BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, + in8, in9, in10, in11, in12, in13, in14, in15, + step0, step1, step2, step3, step4, step5, step6, step7, + in8, in9, in10, in11, in12, in13, in14, in15); + ST_SH8(step0, step1, step2, step3, step4, step5, step6, step7, + (output + 8 * 8), 8); + ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, (output + 16 * 8), 8); +} + +static void fdct8x32_1d_row_even_4x(int16_t *input, int16_t *interm_ptr, + int16_t *out) { + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 in8, in9, in10, in11, in12, in13, in14, in15; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v4i32 vec0_l, vec1_l, vec2_l, vec3_l, vec4_l, vec5_l, vec6_l, vec7_l; + v4i32 vec0_r, vec1_r, vec2_r, vec3_r, vec4_r, vec5_r, vec6_r, vec7_r; + v4i32 tmp0_w, tmp1_w, tmp2_w, tmp3_w; + + /* fdct32 even */ + /* stage 2 */ + LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7); + LD_SH8(input + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15); + + BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, + in8, in9, in10, in11, in12, in13, in14, in15, + vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, + in8, in9, in10, in11, in12, in13, in14, in15); + ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, interm_ptr, 8); + ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, interm_ptr + 64, 8); + + /* Stage 3 */ + UNPCK_SH_SW(vec0, vec0_l, vec0_r); + UNPCK_SH_SW(vec1, vec1_l, vec1_r); + UNPCK_SH_SW(vec2, vec2_l, vec2_r); + UNPCK_SH_SW(vec3, vec3_l, vec3_r); + UNPCK_SH_SW(vec4, vec4_l, vec4_r); + UNPCK_SH_SW(vec5, vec5_l, vec5_r); + UNPCK_SH_SW(vec6, vec6_l, vec6_r); + UNPCK_SH_SW(vec7, vec7_l, vec7_r); + ADD4(vec0_r, vec7_r, vec1_r, vec6_r, vec2_r, vec5_r, vec3_r, vec4_r, + tmp0_w, tmp1_w, tmp2_w, tmp3_w); + BUTTERFLY_4(tmp0_w, tmp1_w, tmp2_w, tmp3_w, vec4_r, vec6_r, vec7_r, vec5_r); + ADD4(vec0_l, vec7_l, vec1_l, vec6_l, vec2_l, vec5_l, vec3_l, vec4_l, + vec0_r, vec1_r, vec2_r, vec3_r); + + tmp3_w = vec0_r + vec3_r; + vec0_r = vec0_r - vec3_r; + vec3_r = vec1_r + vec2_r; + vec1_r = vec1_r - vec2_r; + + VP9_DOTP_CONST_PAIR_W(vec4_r, vec6_r, tmp3_w, vec3_r, cospi_16_64, + cospi_16_64, vec4_r, tmp3_w, vec6_r, vec3_r); + VP9_FDCT32_POSTPROC_NEG_W(vec4_r); + VP9_FDCT32_POSTPROC_NEG_W(tmp3_w); + VP9_FDCT32_POSTPROC_NEG_W(vec6_r); + VP9_FDCT32_POSTPROC_NEG_W(vec3_r); + PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5); + ST_SH2(vec5, vec4, out, 8); + + VP9_DOTP_CONST_PAIR_W(vec5_r, vec7_r, vec0_r, vec1_r, cospi_24_64, + cospi_8_64, vec4_r, 
tmp3_w, vec6_r, vec3_r); + VP9_FDCT32_POSTPROC_NEG_W(vec4_r); + VP9_FDCT32_POSTPROC_NEG_W(tmp3_w); + VP9_FDCT32_POSTPROC_NEG_W(vec6_r); + VP9_FDCT32_POSTPROC_NEG_W(vec3_r); + PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5); + ST_SH2(vec5, vec4, out + 16, 8); + + LD_SH8(interm_ptr, 8, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7); + SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7); + VP9_DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6); + ADD2(vec4, vec5, vec7, vec6, vec0, vec1); + VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, in5, in4); + VP9_FDCT_POSTPROC_2V_NEG_H(in4, in5); + ST_SH(in4, out + 32); + ST_SH(in5, out + 56); + + SUB2(vec4, vec5, vec7, vec6, vec4, vec7); + VP9_DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, in5, in4); + VP9_FDCT_POSTPROC_2V_NEG_H(in4, in5); + ST_SH(in4, out + 40); + ST_SH(in5, out + 48); + + LD_SH8(interm_ptr + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15); + VP9_DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); + VP9_DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); + ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2); + VP9_DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); + ADD2(in0, in1, in2, in3, vec0, vec7); + VP9_DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, in5, in4); + VP9_FDCT_POSTPROC_2V_NEG_H(in4, in5); + ST_SH(in4, out + 64); + ST_SH(in5, out + 120); + + SUB2(in0, in1, in2, in3, in0, in2); + VP9_DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, in5, in4); + VP9_FDCT_POSTPROC_2V_NEG_H(in4, in5); + ST_SH(in4, out + 72); + ST_SH(in5, out + 112); + + SUB2(in9, vec2, in14, vec5, vec2, vec5); + VP9_DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1); + SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5); + VP9_DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, in5, in4); + VP9_FDCT_POSTPROC_2V_NEG_H(in4, in5); + ST_SH(in4, out + 80); + ST_SH(in5, out + 104); + + ADD2(in3, in2, in0, in1, vec3, vec4); + VP9_DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, in4, in5); + VP9_FDCT_POSTPROC_2V_NEG_H(in4, in5); + ST_SH(in4, out + 96); + ST_SH(in5, out + 88); +} + +static void fdct8x32_1d_row_even(int16_t *temp, int16_t *out) { + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 in8, in9, in10, in11, in12, in13, in14, in15; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1; + + /* fdct32 even */ + /* stage 2 */ + LD_SH8(temp, 8, in0, in1, in2, in3, in4, in5, in6, in7); + LD_SH8(temp + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15); + + BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, + in8, in9, in10, in11, in12, in13, in14, in15, + vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, + in8, in9, in10, in11, in12, in13, in14, in15); + + /* Stage 3 */ + ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3); + BUTTERFLY_4(in0, in1, in2, in3, temp0, in4, in1, in0); + VP9_DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0); + VP9_FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + ST_SH(temp0, out); + ST_SH(temp1, out + 8); + + VP9_DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0); + VP9_FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + ST_SH(temp0, out + 16); + ST_SH(temp1, out + 24); + + SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7); + VP9_DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6); + ADD2(vec4, vec5, vec7, vec6, vec0, vec1); + VP9_DOTP_CONST_PAIR(vec1, vec0, 
cospi_28_64, cospi_4_64, temp1, temp0); + VP9_FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + ST_SH(temp0, out + 32); + ST_SH(temp1, out + 56); + + SUB2(vec4, vec5, vec7, vec6, vec4, vec7); + VP9_DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0); + VP9_FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + ST_SH(temp0, out + 40); + ST_SH(temp1, out + 48); + + VP9_DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); + VP9_DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); + ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2); + VP9_DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); + ADD2(in0, in1, in2, in3, vec0, vec7); + VP9_DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0); + VP9_FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + ST_SH(temp0, out + 64); + ST_SH(temp1, out + 120); + + SUB2(in0, in1, in2, in3, in0, in2); + VP9_DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0); + VP9_FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + ST_SH(temp0, out + 72); + ST_SH(temp1, out + 112); + + SUB2(in9, vec2, in14, vec5, vec2, vec5); + VP9_DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1); + SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5) + VP9_DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0); + VP9_FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + ST_SH(temp0, out + 80); + ST_SH(temp1, out + 104); + + ADD2(in3, in2, in0, in1, vec3, vec4); + VP9_DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1); + VP9_FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + ST_SH(temp0, out + 96); + ST_SH(temp1, out + 88); +} + +static void fdct8x32_1d_row_odd(int16_t *temp, int16_t *interm_ptr, + int16_t *out) { + v8i16 in16, in17, in18, in19, in20, in21, in22, in23; + v8i16 in24, in25, in26, in27, in28, in29, in30, in31, vec4, vec5; + + in20 = LD_SH(temp + 32); + in21 = LD_SH(temp + 40); + in26 = LD_SH(temp + 80); + in27 = LD_SH(temp + 88); + + VP9_DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27); + VP9_DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26); + + in18 = LD_SH(temp + 16); + in19 = LD_SH(temp + 24); + in28 = LD_SH(temp + 96); + in29 = LD_SH(temp + 104); + + vec4 = in19 - in20; + ST_SH(vec4, interm_ptr + 32); + vec4 = in18 - in21; + ST_SH(vec4, interm_ptr + 88); + vec4 = in28 - in27; + ST_SH(vec4, interm_ptr + 56); + vec4 = in29 - in26; + ST_SH(vec4, interm_ptr + 64); + + ADD4(in18, in21, in19, in20, in28, in27, in29, in26, in21, in20, in27, in26); + + in22 = LD_SH(temp + 48); + in23 = LD_SH(temp + 56); + in24 = LD_SH(temp + 64); + in25 = LD_SH(temp + 72); + + VP9_DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25); + VP9_DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24); + + in16 = LD_SH(temp); + in17 = LD_SH(temp + 8); + in30 = LD_SH(temp + 112); + in31 = LD_SH(temp + 120); + + vec4 = in17 - in22; + ST_SH(vec4, interm_ptr + 40); + vec4 = in30 - in25; + ST_SH(vec4, interm_ptr + 48); + vec4 = in31 - in24; + ST_SH(vec4, interm_ptr + 72); + vec4 = in16 - in23; + ST_SH(vec4, interm_ptr + 80); + + ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31); + VP9_DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29); + VP9_DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28); + + ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25); + VP9_DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24); + ADD2(in27, in26, in25, in24, in23, in20); + + 
VP9_DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5); + VP9_FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + ST_SH(vec5, out); + ST_SH(vec4, out + 120); + + SUB2(in27, in26, in25, in24, in22, in21); + + VP9_DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4); + VP9_FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + ST_SH(vec5, out + 112); + ST_SH(vec4, out + 8); + + SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20); + VP9_DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25); + SUB2(in26, in27, in24, in25, in23, in20); + + VP9_DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5); + VP9_FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + ST_SH(vec4, out + 16); + ST_SH(vec5, out + 104); + + ADD2(in26, in27, in24, in25, in22, in21); + VP9_DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5); + VP9_FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + ST_SH(vec4, out + 24); + ST_SH(vec5, out + 96); + + in20 = LD_SH(interm_ptr + 32); + in21 = LD_SH(interm_ptr + 88); + in27 = LD_SH(interm_ptr + 56); + in26 = LD_SH(interm_ptr + 64); + + in16 = in20; + in17 = in21; + VP9_DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27); + VP9_DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26); + + in22 = LD_SH(interm_ptr + 40); + in25 = LD_SH(interm_ptr + 48); + in24 = LD_SH(interm_ptr + 72); + in23 = LD_SH(interm_ptr + 80); + + SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31); + VP9_DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30); + ADD2(in28, in29, in31, in30, in16, in19); + VP9_DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4); + VP9_FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + ST_SH(vec5, out + 32); + ST_SH(vec4, out + 88); + + SUB2(in28, in29, in31, in30, in17, in18); + VP9_DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4); + VP9_FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + ST_SH(vec5, out + 40); + ST_SH(vec4, out + 80); + + ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19); + VP9_DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31); + SUB2(in29, in28, in30, in31, in16, in19); + + VP9_DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4); + VP9_FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + ST_SH(vec5, out + 72); + ST_SH(vec4, out + 48); + + ADD2(in29, in28, in30, in31, in17, in18); + + VP9_DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4); + VP9_FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + ST_SH(vec4, out + 56); + ST_SH(vec5, out + 64); +} + +static void fdct8x32_1d_row_transpose_store(int16_t *temp, int16_t *output) { + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1; + + /* 1st set */ + in0 = LD_SH(temp); + in4 = LD_SH(temp + 32); + in2 = LD_SH(temp + 64); + in6 = LD_SH(temp + 96); + in1 = LD_SH(temp + 128); + in7 = LD_SH(temp + 152); + in3 = LD_SH(temp + 192); + in5 = LD_SH(temp + 216); + + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + + /* 2nd set */ + in0_1 = LD_SH(temp + 16); + in1_1 = LD_SH(temp + 232); + in2_1 = LD_SH(temp + 80); + in3_1 = LD_SH(temp + 168); + in4_1 = LD_SH(temp + 48); + in5_1 = LD_SH(temp + 176); + in6_1 = LD_SH(temp + 112); + in7_1 = LD_SH(temp + 240); + + ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 32); + TRANSPOSE8x8_SH_SH(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, + in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1); + + /* 3rd set */ + in0 = 
LD_SH(temp + 8); + in1 = LD_SH(temp + 136); + in2 = LD_SH(temp + 72); + in3 = LD_SH(temp + 200); + in4 = LD_SH(temp + 40); + in5 = LD_SH(temp + 208); + in6 = LD_SH(temp + 104); + in7 = LD_SH(temp + 144); + + ST_SH8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, + output + 8, 32); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output + 16, 32); + + /* 4th set */ + in0_1 = LD_SH(temp + 24); + in1_1 = LD_SH(temp + 224); + in2_1 = LD_SH(temp + 88); + in3_1 = LD_SH(temp + 160); + in4_1 = LD_SH(temp + 56); + in5_1 = LD_SH(temp + 184); + in6_1 = LD_SH(temp + 120); + in7_1 = LD_SH(temp + 248); + + TRANSPOSE8x8_SH_SH(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, + in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1); + ST_SH8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, + output + 24, 32); +} + +static void fdct32x8_1d_row(int16_t *temp, int16_t *temp_buf, + int16_t *output) { + fdct8x32_1d_row_load_butterfly(temp, temp_buf); + fdct8x32_1d_row_even(temp_buf, temp_buf); + fdct8x32_1d_row_odd(temp_buf + 128, temp, temp_buf + 128); + fdct8x32_1d_row_transpose_store(temp_buf, output); +} + +static void fdct32x8_1d_row_4x(int16_t *tmp_buf_big, int16_t *tmp_buf, + int16_t *output) { + fdct8x32_1d_row_load_butterfly(tmp_buf_big, tmp_buf); + fdct8x32_1d_row_even_4x(tmp_buf, tmp_buf_big, tmp_buf); + fdct8x32_1d_row_odd(tmp_buf + 128, tmp_buf_big, tmp_buf + 128); + fdct8x32_1d_row_transpose_store(tmp_buf, output); +} + +void vp9_fdct32x32_msa(const int16_t *input, int16_t *output, + int32_t src_stride) { + int32_t i; + DECLARE_ALIGNED(32, int16_t, tmp_buf_big[1024]); + DECLARE_ALIGNED(32, int16_t, tmp_buf[256]); + + /* column transform */ + for (i = 0; i < 4; ++i) { + fdct8x32_1d_column(input + (8 * i), src_stride, tmp_buf, + tmp_buf_big + (8 * i)); + } + + /* row transform */ + fdct32x8_1d_row_4x(tmp_buf_big, tmp_buf, output); + + /* row transform */ + for (i = 1; i < 4; ++i) { + fdct32x8_1d_row(tmp_buf_big + (i * 256), tmp_buf, output + (i * 256)); + } +} + +void vp9_fdct32x32_1_msa(const int16_t *input, int16_t *out, int32_t stride) { + out[1] = 0; + + out[0] = VP9_LD_HADD(input, stride); + out[0] += VP9_LD_HADD(input + 8, stride); + out[0] += VP9_LD_HADD(input + 16, stride); + out[0] += VP9_LD_HADD(input + 24, stride); + out[0] += VP9_LD_HADD(input + 32 * 8, stride); + out[0] += VP9_LD_HADD(input + 32 * 8 + 8, stride); + out[0] += VP9_LD_HADD(input + 32 * 8 + 16, stride); + out[0] += VP9_LD_HADD(input + 32 * 8 + 24, stride); + out[0] += VP9_LD_HADD(input + 32 * 16, stride); + out[0] += VP9_LD_HADD(input + 32 * 16 + 8, stride); + out[0] += VP9_LD_HADD(input + 32 * 16 + 16, stride); + out[0] += VP9_LD_HADD(input + 32 * 16 + 24, stride); + out[0] += VP9_LD_HADD(input + 32 * 24, stride); + out[0] += VP9_LD_HADD(input + 32 * 24 + 8, stride); + out[0] += VP9_LD_HADD(input + 32 * 24 + 16, stride); + out[0] += VP9_LD_HADD(input + 32 * 24 + 24, stride); + out[0] >>= 3; +} + +static void fdct8x32_1d_row_even_rd(int16_t *temp, int16_t *out) { + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 in8, in9, in10, in11, in12, in13, in14, in15; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1; + + /* fdct32 even */ + /* stage 2 */ + LD_SH8(temp, 8, in0, in1, in2, in3, in4, in5, in6, in7); + LD_SH8(temp + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15); + + BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, + in8, in9, in10, in11, in12, in13, in14, in15, 
+ vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, + in8, in9, in10, in11, in12, in13, in14, in15); + VP9_FDCT_POSTPROC_2V_NEG_H(vec0, vec1); + VP9_FDCT_POSTPROC_2V_NEG_H(vec2, vec3); + VP9_FDCT_POSTPROC_2V_NEG_H(vec4, vec5); + VP9_FDCT_POSTPROC_2V_NEG_H(vec6, vec7); + VP9_FDCT_POSTPROC_2V_NEG_H(in8, in9); + VP9_FDCT_POSTPROC_2V_NEG_H(in10, in11); + VP9_FDCT_POSTPROC_2V_NEG_H(in12, in13); + VP9_FDCT_POSTPROC_2V_NEG_H(in14, in15); + + /* Stage 3 */ + ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3); + + temp0 = in0 + in3; + in0 = in0 - in3; + in3 = in1 + in2; + in1 = in1 - in2; + + VP9_DOTP_CONST_PAIR(temp0, in3, cospi_16_64, cospi_16_64, temp1, temp0); + ST_SH(temp0, out); + ST_SH(temp1, out + 8); + + VP9_DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0); + ST_SH(temp0, out + 16); + ST_SH(temp1, out + 24); + + SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7); + VP9_DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6); + ADD2(vec4, vec5, vec7, vec6, vec0, vec1); + VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0); + ST_SH(temp0, out + 32); + ST_SH(temp1, out + 56); + + SUB2(vec4, vec5, vec7, vec6, vec4, vec7); + VP9_DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0); + ST_SH(temp0, out + 40); + ST_SH(temp1, out + 48); + + VP9_DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); + VP9_DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); + ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2); + VP9_DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); + ADD2(in0, in1, in2, in3, vec0, vec7); + VP9_DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0); + ST_SH(temp0, out + 64); + ST_SH(temp1, out + 120); + + SUB2(in0, in1, in2, in3, in0, in2); + VP9_DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0); + ST_SH(temp0, out + 72); + ST_SH(temp1, out + 112); + + SUB2(in9, vec2, in14, vec5, vec2, vec5); + VP9_DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1); + SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5); + VP9_DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0); + ST_SH(temp0, out + 80); + ST_SH(temp1, out + 104); + + ADD2(in3, in2, in0, in1, vec3, vec4); + VP9_DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1); + ST_SH(temp0, out + 96); + ST_SH(temp1, out + 88); +} + +static void fdct8x32_1d_row_odd_rd(int16_t *temp, int16_t *interm_ptr, + int16_t *out) { + v8i16 in16, in17, in18, in19, in20, in21, in22, in23; + v8i16 in24, in25, in26, in27, in28, in29, in30, in31; + v8i16 vec4, vec5; + + in20 = LD_SH(temp + 32); + in21 = LD_SH(temp + 40); + in26 = LD_SH(temp + 80); + in27 = LD_SH(temp + 88); + + VP9_DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27); + VP9_DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26); + + VP9_FDCT_POSTPROC_2V_NEG_H(in20, in21); + VP9_FDCT_POSTPROC_2V_NEG_H(in26, in27); + + in18 = LD_SH(temp + 16); + in19 = LD_SH(temp + 24); + in28 = LD_SH(temp + 96); + in29 = LD_SH(temp + 104); + + VP9_FDCT_POSTPROC_2V_NEG_H(in18, in19); + VP9_FDCT_POSTPROC_2V_NEG_H(in28, in29); + + vec4 = in19 - in20; + ST_SH(vec4, interm_ptr + 32); + vec4 = in18 - in21; + ST_SH(vec4, interm_ptr + 88); + vec4 = in29 - in26; + ST_SH(vec4, interm_ptr + 64); + vec4 = in28 - in27; + ST_SH(vec4, interm_ptr + 56); + + ADD4(in18, in21, in19, in20, in28, in27, in29, in26, in21, in20, in27, in26); + + in22 = 
LD_SH(temp + 48); + in23 = LD_SH(temp + 56); + in24 = LD_SH(temp + 64); + in25 = LD_SH(temp + 72); + + VP9_DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25); + VP9_DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24); + VP9_FDCT_POSTPROC_2V_NEG_H(in22, in23); + VP9_FDCT_POSTPROC_2V_NEG_H(in24, in25); + + in16 = LD_SH(temp); + in17 = LD_SH(temp + 8); + in30 = LD_SH(temp + 112); + in31 = LD_SH(temp + 120); + + VP9_FDCT_POSTPROC_2V_NEG_H(in16, in17); + VP9_FDCT_POSTPROC_2V_NEG_H(in30, in31); + + vec4 = in17 - in22; + ST_SH(vec4, interm_ptr + 40); + vec4 = in30 - in25; + ST_SH(vec4, interm_ptr + 48); + vec4 = in31 - in24; + ST_SH(vec4, interm_ptr + 72); + vec4 = in16 - in23; + ST_SH(vec4, interm_ptr + 80); + + ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31); + VP9_DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29); + VP9_DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28); + ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25); + VP9_DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24); + ADD2(in27, in26, in25, in24, in23, in20); + VP9_DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5); + ST_SH(vec5, out); + ST_SH(vec4, out + 120); + + SUB2(in27, in26, in25, in24, in22, in21); + VP9_DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4); + ST_SH(vec5, out + 112); + ST_SH(vec4, out + 8); + + SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20); + VP9_DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25); + SUB2(in26, in27, in24, in25, in23, in20); + VP9_DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5); + ST_SH(vec4, out + 16); + ST_SH(vec5, out + 104); + + ADD2(in26, in27, in24, in25, in22, in21); + VP9_DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5); + ST_SH(vec4, out + 24); + ST_SH(vec5, out + 96); + + in20 = LD_SH(interm_ptr + 32); + in21 = LD_SH(interm_ptr + 88); + in27 = LD_SH(interm_ptr + 56); + in26 = LD_SH(interm_ptr + 64); + + in16 = in20; + in17 = in21; + VP9_DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27); + VP9_DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26); + + in22 = LD_SH(interm_ptr + 40); + in25 = LD_SH(interm_ptr + 48); + in24 = LD_SH(interm_ptr + 72); + in23 = LD_SH(interm_ptr + 80); + + SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31); + VP9_DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30); + in16 = in28 + in29; + in19 = in31 + in30; + VP9_DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4); + ST_SH(vec5, out + 32); + ST_SH(vec4, out + 88); + + SUB2(in28, in29, in31, in30, in17, in18); + VP9_DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4); + ST_SH(vec5, out + 40); + ST_SH(vec4, out + 80); + + ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19); + VP9_DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31); + SUB2(in29, in28, in30, in31, in16, in19); + VP9_DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4); + ST_SH(vec5, out + 72); + ST_SH(vec4, out + 48); + + ADD2(in29, in28, in30, in31, in17, in18); + VP9_DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4); + ST_SH(vec4, out + 56); + ST_SH(vec5, out + 64); +} + +static void fdct32x8_1d_row_rd(int16_t *tmp_buf_big, int16_t *tmp_buf, + int16_t *output) { + fdct8x32_1d_row_load_butterfly(tmp_buf_big, tmp_buf); + 
fdct8x32_1d_row_even_rd(tmp_buf, tmp_buf); + fdct8x32_1d_row_odd_rd((tmp_buf + 128), tmp_buf_big, (tmp_buf + 128)); + fdct8x32_1d_row_transpose_store(tmp_buf, output); +} + +void vp9_fdct32x32_rd_msa(const int16_t *input, int16_t *out, + int32_t src_stride) { + int32_t i; + DECLARE_ALIGNED(32, int16_t, tmp_buf_big[1024]); + DECLARE_ALIGNED(32, int16_t, tmp_buf[256]); + + /* column transform */ + for (i = 0; i < 4; ++i) { + fdct8x32_1d_column(input + (8 * i), src_stride, &tmp_buf[0], + &tmp_buf_big[0] + (8 * i)); + } + + /* row transform */ + for (i = 0; i < 4; ++i) { + fdct32x8_1d_row_rd(&tmp_buf_big[0] + (8 * i * 32), &tmp_buf[0], + out + (8 * i * 32)); + } +} diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/mips/msa/vp9_fdct4x4_msa.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/mips/msa/vp9_fdct4x4_msa.c new file mode 100644 index 00000000000..790b4fb8d79 --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/mips/msa/vp9_fdct4x4_msa.c @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> + +#include "./vp9_rtcd.h" +#include "vp9/encoder/mips/msa/vp9_fdct_msa.h" + +void vp9_fwht4x4_msa(const int16_t *input, int16_t *output, + int32_t src_stride) { + v8i16 in0, in1, in2, in3, in4; + + LD_SH4(input, src_stride, in0, in1, in2, in3); + + in0 += in1; + in3 -= in2; + in4 = (in0 - in3) >> 1; + SUB2(in4, in1, in4, in2, in1, in2); + in0 -= in2; + in3 += in1; + + TRANSPOSE4x4_SH_SH(in0, in2, in3, in1, in0, in2, in3, in1); + + in0 += in2; + in1 -= in3; + in4 = (in0 - in1) >> 1; + SUB2(in4, in2, in4, in3, in2, in3); + in0 -= in3; + in1 += in2; + + SLLI_4V(in0, in1, in2, in3, 2); + + TRANSPOSE4x4_SH_SH(in0, in3, in1, in2, in0, in3, in1, in2); + + ST4x2_UB(in0, output, 4); + ST4x2_UB(in3, output + 4, 4); + ST4x2_UB(in1, output + 8, 4); + ST4x2_UB(in2, output + 12, 4); +} + +void vp9_fdct4x4_msa(const int16_t *input, int16_t *output, + int32_t src_stride) { + v8i16 in0, in1, in2, in3; + + LD_SH4(input, src_stride, in0, in1, in2, in3); + + /* fdct4 pre-process */ + { + v8i16 vec, mask; + v16i8 zero = { 0 }; + v16i8 one = __msa_ldi_b(1); + + mask = (v8i16)__msa_sldi_b(zero, one, 15); + SLLI_4V(in0, in1, in2, in3, 4); + vec = __msa_ceqi_h(in0, 0); + vec = vec ^ 255; + vec = mask & vec; + in0 += vec; + } + + VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); + TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); + VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); + TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); + ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3); + SRA_4V(in0, in1, in2, in3, 2); + PCKEV_D2_SH(in1, in0, in3, in2, in0, in2); + ST_SH2(in0, in2, output, 8); +} + +void vp9_fht4x4_msa(const int16_t *input, int16_t *output, int32_t stride, + int32_t tx_type) { + v8i16 in0, in1, in2, in3; + + LD_SH4(input, stride, in0, in1, in2, in3); + + /* fdct4 pre-process */ + { + v8i16 temp, mask; + v16i8 zero = { 0 }; + v16i8 one = __msa_ldi_b(1); + + mask = (v8i16)__msa_sldi_b(zero, one, 15); + SLLI_4V(in0, in1, in2, in3, 4); + temp = __msa_ceqi_h(in0, 0); + temp = (v8i16)__msa_xori_b((v16u8)temp, 255); + temp = mask & temp; + in0 += temp; 
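    /* Annotation, not part of this change: the masked add above appears to
     * bump the first (DC) element of in0 by 1 when that element is non-zero,
     * after the whole block has been scaled up by 16 via SLLI_4V(). This
     * mirrors the step in the scalar reference vp9_fdct4x4_c() that adds 1 to
     * a non-zero first input sample, so the vectorized first pass rounds the
     * DC term the same way as the C code. */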
+ } + + switch (tx_type) { + case DCT_DCT: + VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); + TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); + VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); + break; + case ADST_DCT: + VP9_FADST4(in0, in1, in2, in3, in0, in1, in2, in3); + TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); + VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); + break; + case DCT_ADST: + VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); + TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); + VP9_FADST4(in0, in1, in2, in3, in0, in1, in2, in3); + break; + case ADST_ADST: + VP9_FADST4(in0, in1, in2, in3, in0, in1, in2, in3); + TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); + VP9_FADST4(in0, in1, in2, in3, in0, in1, in2, in3); + break; + default: + assert(0); + break; + } + + TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); + ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3); + SRA_4V(in0, in1, in2, in3, 2); + PCKEV_D2_SH(in1, in0, in3, in2, in0, in2); + ST_SH2(in0, in2, output, 8); +} diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/mips/msa/vp9_fdct8x8_msa.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/mips/msa/vp9_fdct8x8_msa.c new file mode 100644 index 00000000000..68e65723abb --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/mips/msa/vp9_fdct8x8_msa.c @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <assert.h> + +#include "./vp9_rtcd.h" +#include "vp9/encoder/mips/msa/vp9_fdct_msa.h" + +void vp9_fdct8x8_msa(const int16_t *input, int16_t *output, + int32_t src_stride) { + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + + LD_SH8(input, src_stride, in0, in1, in2, in3, in4, in5, in6, in7); + SLLI_4V(in0, in1, in2, in3, 2); + SLLI_4V(in4, in5, in6, in7, 2); + VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + VP9_SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7); + ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 8); +} + +void vp9_fdct8x8_1_msa(const int16_t *input, int16_t *out, int32_t stride) { + out[0] = VP9_LD_HADD(input, stride); + out[1] = 0; +} + +void vp9_fht8x8_msa(const int16_t *input, int16_t *output, int32_t stride, + int32_t tx_type) { + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + + LD_SH8(input, stride, in0, in1, in2, in3, in4, in5, in6, in7); + SLLI_4V(in0, in1, in2, in3, 2); + SLLI_4V(in4, in5, in6, in7, 2); + + switch (tx_type) { + case DCT_DCT: + VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + break; + case ADST_DCT: + VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + break; + case DCT_ADST: + VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + break; + case ADST_ADST: + VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + break; + default: + assert(0); + break; + } + + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + VP9_SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7); + ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 8); +} diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/mips/msa/vp9_fdct_msa.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/mips/msa/vp9_fdct_msa.h new file mode 100644 index 00000000000..ad66576b6e5 --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/mips/msa/vp9_fdct_msa.h @@ -0,0 +1,548 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP9_ENCODER_MIPS_MSA_VP9_FDCT_MSA_H_ +#define VP9_ENCODER_MIPS_MSA_VP9_FDCT_MSA_H_ + +#include "vpx_ports/mem.h" +#include "vp9/common/vp9_idct.h" +#include "vp9/common/mips/msa/vp9_macros_msa.h" + +#define VP9_DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) { \ + v8i16 k0_m = __msa_fill_h(cnst0); \ + v4i32 s0_m, s1_m, s2_m, s3_m; \ + \ + s0_m = (v4i32)__msa_fill_h(cnst1); \ + k0_m = __msa_ilvev_h((v8i16)s0_m, k0_m); \ + \ + ILVRL_H2_SW((-reg1), reg0, s1_m, s0_m); \ + ILVRL_H2_SW(reg0, reg1, s3_m, s2_m); \ + DOTP_SH2_SW(s1_m, s0_m, k0_m, k0_m, s1_m, s0_m); \ + SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS); \ + out0 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m); \ + \ + DOTP_SH2_SW(s3_m, s2_m, k0_m, k0_m, s1_m, s0_m); \ + SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS); \ + out1 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m); \ +} + +#define VP9_DOT_ADD_SUB_SRARI_PCK(in0, in1, in2, in3, in4, in5, in6, in7, \ + dst0, dst1, dst2, dst3) { \ + v4i32 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m; \ + v4i32 tp5_m, tp6_m, tp7_m, tp8_m, tp9_m; \ + \ + DOTP_SH4_SW(in0, in1, in0, in1, in4, in4, in5, in5, \ + tp0_m, tp2_m, tp3_m, tp4_m); \ + DOTP_SH4_SW(in2, in3, in2, in3, in6, in6, in7, in7, \ + tp5_m, tp6_m, tp7_m, tp8_m); \ + BUTTERFLY_4(tp0_m, tp3_m, tp7_m, tp5_m, tp1_m, tp9_m, tp7_m, tp5_m); \ + BUTTERFLY_4(tp2_m, tp4_m, tp8_m, tp6_m, tp3_m, tp0_m, tp4_m, tp2_m); \ + SRARI_W4_SW(tp1_m, tp9_m, tp7_m, tp5_m, DCT_CONST_BITS); \ + SRARI_W4_SW(tp3_m, tp0_m, tp4_m, tp2_m, DCT_CONST_BITS); \ + PCKEV_H4_SH(tp1_m, tp3_m, tp9_m, tp0_m, tp7_m, tp4_m, tp5_m, tp2_m, \ + dst0, dst1, dst2, dst3); \ +} + +#define VP9_DOT_SHIFT_RIGHT_PCK_H(in0, in1, in2) ({ \ + v8i16 dst_m; \ + v4i32 tp0_m, tp1_m; \ + \ + DOTP_SH2_SW(in0, in1, in2, in2, tp1_m, tp0_m); \ + SRARI_W2_SW(tp1_m, tp0_m, DCT_CONST_BITS); \ + dst_m = __msa_pckev_h((v8i16)tp1_m, (v8i16)tp0_m); \ + \ + dst_m; \ +}) + +#define VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3, out4, out5, out6, out7) { \ + v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst4_m; \ + v8i16 vec0_m, vec1_m, vec2_m, vec3_m, s0_m, s1_m; \ + v8i16 coeff0_m = { cospi_2_64, cospi_6_64, cospi_10_64, cospi_14_64, \ + cospi_18_64, cospi_22_64, cospi_26_64, cospi_30_64 }; \ + v8i16 coeff1_m = { cospi_8_64, -cospi_8_64, cospi_16_64, -cospi_16_64, \ + cospi_24_64, -cospi_24_64, 0, 0 }; \ + \ + SPLATI_H2_SH(coeff0_m, 0, 7, cnst0_m, cnst1_m); \ + cnst2_m = -cnst0_m; \ + ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m); \ + SPLATI_H2_SH(coeff0_m, 4, 3, cnst2_m, cnst3_m); \ + cnst4_m = -cnst2_m; \ + ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m); \ + \ + ILVRL_H2_SH(in0, in7, vec1_m, vec0_m); \ + ILVRL_H2_SH(in4, in3, vec3_m, vec2_m); \ + VP9_DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, \ + cnst1_m, cnst2_m, cnst3_m, in7, in0, \ + in4, in3); \ + \ + SPLATI_H2_SH(coeff0_m, 2, 5, cnst0_m, cnst1_m); \ + cnst2_m = -cnst0_m; \ + ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m); \ + SPLATI_H2_SH(coeff0_m, 6, 1, cnst2_m, cnst3_m); \ + cnst4_m = -cnst2_m; \ + ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m); \ + \ + ILVRL_H2_SH(in2, in5, vec1_m, vec0_m); \ + ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \ + \ + VP9_DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, \ + cnst1_m, cnst2_m, cnst3_m, in5, in2, \ + in6, in1); \ + BUTTERFLY_4(in7, in0, in2, in5, s1_m, s0_m, in2, in5); \ + out7 = -s0_m; \ + 
out0 = s1_m; \ + \ + SPLATI_H4_SH(coeff1_m, 0, 4, 1, 5, cnst0_m, cnst1_m, cnst2_m, cnst3_m); \ + \ + ILVEV_H2_SH(cnst3_m, cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst2_m); \ + cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + cnst1_m = cnst0_m; \ + \ + ILVRL_H2_SH(in4, in3, vec1_m, vec0_m); \ + ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \ + VP9_DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, \ + cnst2_m, cnst3_m, cnst1_m, out1, out6, \ + s0_m, s1_m); \ + \ + SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m); \ + cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + \ + ILVRL_H2_SH(in2, in5, vec1_m, vec0_m); \ + ILVRL_H2_SH(s0_m, s1_m, vec3_m, vec2_m); \ + out3 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ + out4 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \ + out2 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m); \ + out5 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m); \ + \ + out1 = -out1; \ + out3 = -out3; \ + out5 = -out5; \ +} + +#define VP9_MADD_SHORT(m0, m1, c0, c1, res0, res1) { \ + v4i32 madd0_m, madd1_m, madd2_m, madd3_m; \ + v8i16 madd_s0_m, madd_s1_m; \ + \ + ILVRL_H2_SH(m1, m0, madd_s0_m, madd_s1_m); \ + DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s0_m, madd_s1_m, \ + c0, c0, c1, c1, madd0_m, madd1_m, madd2_m, madd3_m); \ + SRARI_W4_SW(madd0_m, madd1_m, madd2_m, madd3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(madd1_m, madd0_m, madd3_m, madd2_m, res0, res1); \ +} + +#define VP9_MADD_BF(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3, \ + out0, out1, out2, out3) { \ + v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m; \ + v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m, m4_m, m5_m; \ + \ + ILVRL_H2_SH(inp1, inp0, madd_s0_m, madd_s1_m); \ + ILVRL_H2_SH(inp3, inp2, madd_s2_m, madd_s3_m); \ + DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m, \ + cst0, cst0, cst2, cst2, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ + BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m, \ + m4_m, m5_m, tmp3_m, tmp2_m); \ + SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out0, out1); \ + DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m, \ + cst1, cst1, cst3, cst3, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ + BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m, \ + m4_m, m5_m, tmp3_m, tmp2_m); \ + SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out2, out3); \ +} + +#define VP9_LD_HADD(psrc, stride) ({ \ + v8i16 in0_m, in1_m, in2_m, in3_m, in4_m, in5_m, in6_m, in7_m; \ + v4i32 vec_w_m; \ + \ + LD_SH4((psrc), stride, in0_m, in1_m, in2_m, in3_m); \ + ADD2(in0_m, in1_m, in2_m, in3_m, in0_m, in2_m); \ + LD_SH4(((psrc) + 4 * stride), stride, in4_m, in5_m, in6_m, in7_m); \ + ADD4(in4_m, in5_m, in6_m, in7_m, in0_m, in2_m, in4_m, in6_m, \ + in4_m, in6_m, in0_m, in4_m); \ + in0_m += in4_m; \ + \ + vec_w_m = __msa_hadd_s_w(in0_m, in0_m); \ + HADD_SW_S32(vec_w_m); \ +}) + +#define VP9_FDCT_POSTPROC_2V_NEG_H(vec0, vec1) { \ + v8i16 tp0_m, tp1_m; \ + v8i16 one_m = __msa_ldi_h(1); \ + \ + tp0_m = __msa_clti_s_h(vec0, 0); \ + tp1_m = __msa_clti_s_h(vec1, 0); \ + vec0 += 1; \ + vec1 += 1; \ + tp0_m = one_m & tp0_m; \ + tp1_m = one_m & tp1_m; \ + vec0 += tp0_m; \ + vec1 += tp1_m; \ + vec0 >>= 2; \ + vec1 >>= 2; \ +} + +#define VP9_FDCT4(in0, in1, in2, in3, out0, out1, out2, out3) { \ + v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m; \ + v8i16 vec0_m, vec1_m, vec2_m, vec3_m; \ + v4i32 vec4_m, vec5_m, vec6_m, vec7_m; \ + v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64, \ + cospi_24_64, -cospi_8_64, 0, 0, 0 }; \ + \ + BUTTERFLY_4(in0, in1, 
in2, in3, vec0_m, vec1_m, vec2_m, vec3_m); \ + ILVR_H2_SH(vec1_m, vec0_m, vec3_m, vec2_m, vec0_m, vec2_m); \ + SPLATI_H2_SH(coeff_m, 0, 1, cnst0_m, cnst1_m); \ + cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + vec5_m = __msa_dotp_s_w(vec0_m, cnst1_m); \ + \ + SPLATI_H2_SH(coeff_m, 4, 3, cnst2_m, cnst3_m); \ + cnst2_m = __msa_ilvev_h(cnst3_m, cnst2_m); \ + vec7_m = __msa_dotp_s_w(vec2_m, cnst2_m); \ + \ + vec4_m = __msa_dotp_s_w(vec0_m, cnst0_m); \ + cnst2_m = __msa_splati_h(coeff_m, 2); \ + cnst2_m = __msa_ilvev_h(cnst2_m, cnst3_m); \ + vec6_m = __msa_dotp_s_w(vec2_m, cnst2_m); \ + \ + SRARI_W4_SW(vec4_m, vec5_m, vec6_m, vec7_m, DCT_CONST_BITS); \ + PCKEV_H4_SH(vec4_m, vec4_m, vec5_m, vec5_m, vec6_m, vec6_m, \ + vec7_m, vec7_m, out0, out2, out1, out3); \ +} + +#define VP9_FADST4(in0, in1, in2, in3, out0, out1, out2, out3) { \ + v4i32 s0_m, s1_m, s2_m, s3_m, constant_m; \ + v4i32 in0_r_m, in1_r_m, in2_r_m, in3_r_m; \ + \ + UNPCK_R_SH_SW(in0, in0_r_m); \ + UNPCK_R_SH_SW(in1, in1_r_m); \ + UNPCK_R_SH_SW(in2, in2_r_m); \ + UNPCK_R_SH_SW(in3, in3_r_m); \ + \ + constant_m = __msa_fill_w(sinpi_4_9); \ + MUL2(in0_r_m, constant_m, in3_r_m, constant_m, s1_m, s0_m); \ + \ + constant_m = __msa_fill_w(sinpi_1_9); \ + s0_m += in0_r_m * constant_m; \ + s1_m -= in1_r_m * constant_m; \ + \ + constant_m = __msa_fill_w(sinpi_2_9); \ + s0_m += in1_r_m * constant_m; \ + s1_m += in3_r_m * constant_m; \ + \ + s2_m = in0_r_m + in1_r_m - in3_r_m; \ + \ + constant_m = __msa_fill_w(sinpi_3_9); \ + MUL2(in2_r_m, constant_m, s2_m, constant_m, s3_m, in1_r_m); \ + \ + in0_r_m = s0_m + s3_m; \ + s2_m = s1_m - s3_m; \ + s3_m = s1_m - s0_m + s3_m; \ + \ + SRARI_W4_SW(in0_r_m, in1_r_m, s2_m, s3_m, DCT_CONST_BITS); \ + PCKEV_H4_SH(in0_r_m, in0_r_m, in1_r_m, in1_r_m, s2_m, s2_m, \ + s3_m, s3_m, out0, out1, out2, out3); \ +} + +#define VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3, out4, out5, out6, out7) { \ + v8i16 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m; \ + v8i16 s7_m, x0_m, x1_m, x2_m, x3_m; \ + v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64, \ + cospi_24_64, cospi_4_64, cospi_28_64, \ + cospi_12_64, cospi_20_64 }; \ + \ + /* FDCT stage1 */ \ + BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, \ + s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m); \ + BUTTERFLY_4(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m); \ + ILVL_H2_SH(x1_m, x0_m, x3_m, x2_m, s0_m, s2_m); \ + ILVR_H2_SH(x1_m, x0_m, x3_m, x2_m, s1_m, s3_m); \ + SPLATI_H2_SH(coeff_m, 0, 1, x0_m, x1_m); \ + x1_m = __msa_ilvev_h(x1_m, x0_m); \ + out4 = VP9_DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m); \ + \ + SPLATI_H2_SH(coeff_m, 2, 3, x2_m, x3_m); \ + x2_m = -x2_m; \ + x2_m = __msa_ilvev_h(x3_m, x2_m); \ + out6 = VP9_DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m); \ + \ + out0 = VP9_DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m); \ + x2_m = __msa_splati_h(coeff_m, 2); \ + x2_m = __msa_ilvev_h(x2_m, x3_m); \ + out2 = VP9_DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m); \ + \ + /* stage2 */ \ + ILVRL_H2_SH(s5_m, s6_m, s1_m, s0_m); \ + \ + s6_m = VP9_DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m); \ + s5_m = VP9_DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m); \ + \ + /* stage3 */ \ + BUTTERFLY_4(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m); \ + \ + /* stage4 */ \ + ILVL_H2_SH(x3_m, x0_m, x2_m, x1_m, s4_m, s6_m); \ + ILVR_H2_SH(x3_m, x0_m, x2_m, x1_m, s5_m, s7_m); \ + \ + SPLATI_H2_SH(coeff_m, 4, 5, x0_m, x1_m); \ + x1_m = __msa_ilvev_h(x0_m, x1_m); \ + out1 = VP9_DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m); \ + \ + SPLATI_H2_SH(coeff_m, 6, 7, x2_m, x3_m); \ + x2_m = 
__msa_ilvev_h(x3_m, x2_m); \ + out5 = VP9_DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \ + \ + x1_m = __msa_splati_h(coeff_m, 5); \ + x0_m = -x0_m; \ + x0_m = __msa_ilvev_h(x1_m, x0_m); \ + out7 = VP9_DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m); \ + \ + x2_m = __msa_splati_h(coeff_m, 6); \ + x3_m = -x3_m; \ + x2_m = __msa_ilvev_h(x2_m, x3_m); \ + out3 = VP9_DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \ +} + +#define VP9_SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7) { \ + v8i16 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ + \ + SRLI_H4_SH(in0, in1, in2, in3, vec0_m, vec1_m, vec2_m, vec3_m, 15); \ + SRLI_H4_SH(in4, in5, in6, in7, vec4_m, vec5_m, vec6_m, vec7_m, 15); \ + AVE_SH4_SH(vec0_m, in0, vec1_m, in1, vec2_m, in2, vec3_m, in3, \ + in0, in1, in2, in3); \ + AVE_SH4_SH(vec4_m, in4, vec5_m, in5, vec6_m, in6, vec7_m, in7, \ + in4, in5, in6, in7); \ +} + +#define VP9_FDCT8x16_EVEN(in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3, out4, out5, out6, out7) { \ + v8i16 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m; \ + v8i16 x0_m, x1_m, x2_m, x3_m; \ + v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, \ + cospi_4_64, cospi_28_64, cospi_12_64, cospi_20_64 }; \ + \ + /* FDCT stage1 */ \ + BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, \ + s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m); \ + BUTTERFLY_4(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m); \ + ILVL_H2_SH(x1_m, x0_m, x3_m, x2_m, s0_m, s2_m); \ + ILVR_H2_SH(x1_m, x0_m, x3_m, x2_m, s1_m, s3_m); \ + SPLATI_H2_SH(coeff_m, 0, 1, x0_m, x1_m); \ + x1_m = __msa_ilvev_h(x1_m, x0_m); \ + out4 = VP9_DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m); \ + \ + SPLATI_H2_SH(coeff_m, 2, 3, x2_m, x3_m); \ + x2_m = -x2_m; \ + x2_m = __msa_ilvev_h(x3_m, x2_m); \ + out6 = VP9_DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m); \ + \ + out0 = VP9_DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m); \ + x2_m = __msa_splati_h(coeff_m, 2); \ + x2_m = __msa_ilvev_h(x2_m, x3_m); \ + out2 = VP9_DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m); \ + \ + /* stage2 */ \ + ILVRL_H2_SH(s5_m, s6_m, s1_m, s0_m); \ + \ + s6_m = VP9_DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m); \ + s5_m = VP9_DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m); \ + \ + /* stage3 */ \ + BUTTERFLY_4(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m); \ + \ + /* stage4 */ \ + ILVL_H2_SH(x3_m, x0_m, x2_m, x1_m, s4_m, s6_m); \ + ILVR_H2_SH(x3_m, x0_m, x2_m, x1_m, s5_m, s7_m); \ + \ + SPLATI_H2_SH(coeff_m, 4, 5, x0_m, x1_m); \ + x1_m = __msa_ilvev_h(x0_m, x1_m); \ + out1 = VP9_DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m); \ + \ + SPLATI_H2_SH(coeff_m, 6, 7, x2_m, x3_m); \ + x2_m = __msa_ilvev_h(x3_m, x2_m); \ + out5 = VP9_DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \ + \ + x1_m = __msa_splati_h(coeff_m, 5); \ + x0_m = -x0_m; \ + x0_m = __msa_ilvev_h(x1_m, x0_m); \ + out7 = VP9_DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m); \ + \ + x2_m = __msa_splati_h(coeff_m, 6); \ + x3_m = -x3_m; \ + x2_m = __msa_ilvev_h(x2_m, x3_m); \ + out3 = VP9_DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \ +} + +#define VP9_FDCT8x16_ODD(input0, input1, input2, input3, \ + input4, input5, input6, input7, \ + out1, out3, out5, out7, \ + out9, out11, out13, out15) { \ + v8i16 stp21_m, stp22_m, stp23_m, stp24_m, stp25_m, stp26_m; \ + v8i16 stp30_m, stp31_m, stp32_m, stp33_m, stp34_m, stp35_m; \ + v8i16 stp36_m, stp37_m, vec0_m, vec1_m; \ + v8i16 vec2_m, vec3_m, vec4_m, vec5_m, vec6_m; \ + v8i16 cnst0_m, cnst1_m, cnst4_m, cnst5_m; \ + v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64, \ + cospi_24_64, -cospi_8_64, -cospi_24_64, \ + 
cospi_12_64, cospi_20_64 }; \ + v8i16 coeff1_m = { cospi_2_64, cospi_30_64, cospi_14_64, \ + cospi_18_64, cospi_10_64, cospi_22_64, \ + cospi_6_64, cospi_26_64 }; \ + v8i16 coeff2_m = { -cospi_2_64, -cospi_10_64, -cospi_18_64, \ + -cospi_26_64, 0, 0, 0, 0 }; \ + \ + /* stp 1 */ \ + ILVL_H2_SH(input2, input5, input3, input4, vec2_m, vec4_m); \ + ILVR_H2_SH(input2, input5, input3, input4, vec3_m, vec5_m); \ + \ + cnst4_m = __msa_splati_h(coeff_m, 0); \ + stp25_m = VP9_DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst4_m); \ + \ + cnst5_m = __msa_splati_h(coeff_m, 1); \ + cnst5_m = __msa_ilvev_h(cnst5_m, cnst4_m); \ + stp22_m = VP9_DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst5_m); \ + stp24_m = VP9_DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst4_m); \ + stp23_m = VP9_DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst5_m); \ + \ + /* stp2 */ \ + BUTTERFLY_4(input0, input1, stp22_m, stp23_m, \ + stp30_m, stp31_m, stp32_m, stp33_m); \ + BUTTERFLY_4(input7, input6, stp25_m, stp24_m, \ + stp37_m, stp36_m, stp35_m, stp34_m); \ + \ + ILVL_H2_SH(stp36_m, stp31_m, stp35_m, stp32_m, vec2_m, vec4_m); \ + ILVR_H2_SH(stp36_m, stp31_m, stp35_m, stp32_m, vec3_m, vec5_m); \ + \ + SPLATI_H2_SH(coeff_m, 2, 3, cnst0_m, cnst1_m); \ + cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ + stp26_m = VP9_DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m); \ + \ + cnst0_m = __msa_splati_h(coeff_m, 4); \ + cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + stp21_m = VP9_DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m); \ + \ + SPLATI_H2_SH(coeff_m, 5, 2, cnst0_m, cnst1_m); \ + cnst1_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ + stp25_m = VP9_DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m); \ + \ + cnst0_m = __msa_splati_h(coeff_m, 3); \ + cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + stp22_m = VP9_DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m); \ + \ + /* stp4 */ \ + BUTTERFLY_4(stp30_m, stp37_m, stp26_m, stp21_m, \ + vec6_m, vec2_m, vec4_m, vec5_m); \ + BUTTERFLY_4(stp33_m, stp34_m, stp25_m, stp22_m, \ + stp21_m, stp23_m, stp24_m, stp31_m); \ + \ + ILVRL_H2_SH(vec2_m, vec6_m, vec1_m, vec0_m); \ + SPLATI_H2_SH(coeff1_m, 0, 1, cnst0_m, cnst1_m); \ + cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ + \ + out1 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ + \ + cnst0_m = __msa_splati_h(coeff2_m, 0); \ + cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + out15 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ + \ + ILVRL_H2_SH(vec4_m, vec5_m, vec1_m, vec0_m); \ + SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m); \ + cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + \ + out9 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \ + \ + cnst1_m = __msa_splati_h(coeff2_m, 2); \ + cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ + out7 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ + \ + ILVRL_H2_SH(stp23_m, stp21_m, vec1_m, vec0_m); \ + SPLATI_H2_SH(coeff1_m, 4, 5, cnst0_m, cnst1_m); \ + cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ + out5 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ + \ + cnst0_m = __msa_splati_h(coeff2_m, 1); \ + cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + out11 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ + \ + ILVRL_H2_SH(stp24_m, stp31_m, vec1_m, vec0_m); \ + SPLATI_H2_SH(coeff1_m, 6, 7, cnst0_m, cnst1_m); \ + cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + \ + out13 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \ + \ + cnst1_m = __msa_splati_h(coeff2_m, 3); \ + cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ + out3 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ +} + +#define 
VP9_FDCT32_POSTPROC_NEG_W(vec) { \ + v4i32 temp_m; \ + v4i32 one_m = __msa_ldi_w(1); \ + \ + temp_m = __msa_clti_s_w(vec, 0); \ + vec += 1; \ + temp_m = one_m & temp_m; \ + vec += temp_m; \ + vec >>= 2; \ +} + +#define VP9_FDCT32_POSTPROC_2V_POS_H(vec0, vec1) { \ + v8i16 tp0_m, tp1_m; \ + v8i16 one = __msa_ldi_h(1); \ + \ + tp0_m = __msa_clei_s_h(vec0, 0); \ + tp1_m = __msa_clei_s_h(vec1, 0); \ + tp0_m = (v8i16)__msa_xori_b((v16u8)tp0_m, 255); \ + tp1_m = (v8i16)__msa_xori_b((v16u8)tp1_m, 255); \ + vec0 += 1; \ + vec1 += 1; \ + tp0_m = one & tp0_m; \ + tp1_m = one & tp1_m; \ + vec0 += tp0_m; \ + vec1 += tp1_m; \ + vec0 >>= 2; \ + vec1 >>= 2; \ +} + +#define VP9_DOTP_CONST_PAIR_W(reg0_left, reg1_left, reg0_right, \ + reg1_right, const0, const1, \ + out0, out1, out2, out3) { \ + v4i32 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m; \ + v2i64 tp0_m, tp1_m, tp2_m, tp3_m; \ + v4i32 k0_m = __msa_fill_w((int32_t) const0); \ + \ + s0_m = __msa_fill_w((int32_t) const1); \ + k0_m = __msa_ilvev_w(s0_m, k0_m); \ + \ + ILVRL_W2_SW(-reg1_left, reg0_left, s1_m, s0_m); \ + ILVRL_W2_SW(reg0_left, reg1_left, s3_m, s2_m); \ + ILVRL_W2_SW(-reg1_right, reg0_right, s5_m, s4_m); \ + ILVRL_W2_SW(reg0_right, reg1_right, s7_m, s6_m); \ + \ + DOTP_SW2_SD(s0_m, s1_m, k0_m, k0_m, tp0_m, tp1_m); \ + DOTP_SW2_SD(s4_m, s5_m, k0_m, k0_m, tp2_m, tp3_m); \ + tp0_m = __msa_srari_d(tp0_m, DCT_CONST_BITS); \ + tp1_m = __msa_srari_d(tp1_m, DCT_CONST_BITS); \ + tp2_m = __msa_srari_d(tp2_m, DCT_CONST_BITS); \ + tp3_m = __msa_srari_d(tp3_m, DCT_CONST_BITS); \ + out0 = __msa_pckev_w((v4i32)tp0_m, (v4i32)tp1_m); \ + out1 = __msa_pckev_w((v4i32)tp2_m, (v4i32)tp3_m); \ + \ + DOTP_SW2_SD(s2_m, s3_m, k0_m, k0_m, tp0_m, tp1_m); \ + DOTP_SW2_SD(s6_m, s7_m, k0_m, k0_m, tp2_m, tp3_m); \ + tp0_m = __msa_srari_d(tp0_m, DCT_CONST_BITS); \ + tp1_m = __msa_srari_d(tp1_m, DCT_CONST_BITS); \ + tp2_m = __msa_srari_d(tp2_m, DCT_CONST_BITS); \ + tp3_m = __msa_srari_d(tp3_m, DCT_CONST_BITS); \ + out2 = __msa_pckev_w((v4i32)tp0_m, (v4i32)tp1_m); \ + out3 = __msa_pckev_w((v4i32)tp2_m, (v4i32)tp3_m); \ +} +#endif /* VP9_ENCODER_MIPS_MSA_VP9_FDCT_MSA_H_ */ diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/mips/msa/vp9_subtract_msa.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/mips/msa/vp9_subtract_msa.c new file mode 100644 index 00000000000..1b8b694ce38 --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/mips/msa/vp9_subtract_msa.c @@ -0,0 +1,264 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "./vp9_rtcd.h" +#include "vp9/common/mips/msa/vp9_macros_msa.h" + +static void sub_blk_4x4_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *pred_ptr, int32_t pred_stride, + int16_t *diff_ptr, int32_t diff_stride) { + uint32_t src0, src1, src2, src3; + uint32_t pred0, pred1, pred2, pred3; + v16i8 src = { 0 }; + v16i8 pred = { 0 }; + v16u8 src_l0, src_l1; + v8i16 diff0, diff1; + + LW4(src_ptr, src_stride, src0, src1, src2, src3); + LW4(pred_ptr, pred_stride, pred0, pred1, pred2, pred3); + INSERT_W4_SB(src0, src1, src2, src3, src); + INSERT_W4_SB(pred0, pred1, pred2, pred3, pred); + ILVRL_B2_UB(src, pred, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST8x4_UB(diff0, diff1, diff_ptr, (2 * diff_stride)); +} + +static void sub_blk_8x8_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *pred_ptr, int32_t pred_stride, + int16_t *diff_ptr, int32_t diff_stride) { + uint32_t loop_cnt; + uint64_t src0, src1, pred0, pred1; + v16i8 src = { 0 }; + v16i8 pred = { 0 }; + v16u8 src_l0, src_l1; + v8i16 diff0, diff1; + + for (loop_cnt = 4; loop_cnt--;) { + LD2(src_ptr, src_stride, src0, src1); + src_ptr += (2 * src_stride); + LD2(pred_ptr, pred_stride, pred0, pred1); + pred_ptr += (2 * pred_stride); + + INSERT_D2_SB(src0, src1, src); + INSERT_D2_SB(pred0, pred1, pred); + ILVRL_B2_UB(src, pred, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff_ptr, diff_stride); + diff_ptr += (2 * diff_stride); + } +} + +static void sub_blk_16x16_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *pred, int32_t pred_stride, + int16_t *diff, int32_t diff_stride) { + int8_t count; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7; + v16u8 src_l0, src_l1; + v8i16 diff0, diff1; + + for (count = 2; count--;) { + LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + + LD_SB8(pred, pred_stride, + pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7); + pred += (8 * pred_stride); + + ILVRL_B2_UB(src0, pred0, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + diff += diff_stride; + + ILVRL_B2_UB(src1, pred1, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + diff += diff_stride; + + ILVRL_B2_UB(src2, pred2, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + diff += diff_stride; + + ILVRL_B2_UB(src3, pred3, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + diff += diff_stride; + + ILVRL_B2_UB(src4, pred4, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + diff += diff_stride; + + ILVRL_B2_UB(src5, pred5, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + diff += diff_stride; + + ILVRL_B2_UB(src6, pred6, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + diff += diff_stride; + + ILVRL_B2_UB(src7, pred7, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + diff += diff_stride; + } +} + +static void sub_blk_32x32_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *pred, int32_t pred_stride, + int16_t *diff, int32_t diff_stride) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v16i8 pred0, pred1, pred2, pred3, 
pred4, pred5, pred6, pred7; + v16u8 src_l0, src_l1; + v8i16 diff0, diff1; + + for (loop_cnt = 8; loop_cnt--;) { + LD_SB2(src, 16, src0, src1); + src += src_stride; + LD_SB2(src, 16, src2, src3); + src += src_stride; + LD_SB2(src, 16, src4, src5); + src += src_stride; + LD_SB2(src, 16, src6, src7); + src += src_stride; + + LD_SB2(pred, 16, pred0, pred1); + pred += pred_stride; + LD_SB2(pred, 16, pred2, pred3); + pred += pred_stride; + LD_SB2(pred, 16, pred4, pred5); + pred += pred_stride; + LD_SB2(pred, 16, pred6, pred7); + pred += pred_stride; + + ILVRL_B2_UB(src0, pred0, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + ILVRL_B2_UB(src1, pred1, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff + 16, 8); + diff += diff_stride; + + ILVRL_B2_UB(src2, pred2, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + ILVRL_B2_UB(src3, pred3, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff + 16, 8); + diff += diff_stride; + + ILVRL_B2_UB(src4, pred4, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + ILVRL_B2_UB(src5, pred5, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff + 16, 8); + diff += diff_stride; + + ILVRL_B2_UB(src6, pred6, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + ILVRL_B2_UB(src7, pred7, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff + 16, 8); + diff += diff_stride; + } +} + +static void sub_blk_64x64_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *pred, int32_t pred_stride, + int16_t *diff, int32_t diff_stride) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7; + v16u8 src_l0, src_l1; + v8i16 diff0, diff1; + + for (loop_cnt = 32; loop_cnt--;) { + LD_SB4(src, 16, src0, src1, src2, src3); + src += src_stride; + LD_SB4(src, 16, src4, src5, src6, src7); + src += src_stride; + + LD_SB4(pred, 16, pred0, pred1, pred2, pred3); + pred += pred_stride; + LD_SB4(pred, 16, pred4, pred5, pred6, pred7); + pred += pred_stride; + + ILVRL_B2_UB(src0, pred0, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + ILVRL_B2_UB(src1, pred1, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff + 16, 8); + ILVRL_B2_UB(src2, pred2, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff + 32, 8); + ILVRL_B2_UB(src3, pred3, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff + 48, 8); + diff += diff_stride; + + ILVRL_B2_UB(src4, pred4, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + ILVRL_B2_UB(src5, pred5, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff + 16, 8); + ILVRL_B2_UB(src6, pred6, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff + 32, 8); + ILVRL_B2_UB(src7, pred7, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff + 48, 8); + diff += diff_stride; + } +} + +void vp9_subtract_block_msa(int32_t rows, int32_t cols, + int16_t *diff_ptr, ptrdiff_t diff_stride, + const uint8_t *src_ptr, ptrdiff_t src_stride, + const uint8_t *pred_ptr, 
ptrdiff_t pred_stride) { + if (rows == cols) { + switch (rows) { + case 4: + sub_blk_4x4_msa(src_ptr, src_stride, pred_ptr, pred_stride, + diff_ptr, diff_stride); + break; + case 8: + sub_blk_8x8_msa(src_ptr, src_stride, pred_ptr, pred_stride, + diff_ptr, diff_stride); + break; + case 16: + sub_blk_16x16_msa(src_ptr, src_stride, pred_ptr, pred_stride, + diff_ptr, diff_stride); + break; + case 32: + sub_blk_32x32_msa(src_ptr, src_stride, pred_ptr, pred_stride, + diff_ptr, diff_stride); + break; + case 64: + sub_blk_64x64_msa(src_ptr, src_stride, pred_ptr, pred_stride, + diff_ptr, diff_stride); + break; + default: + vp9_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr, + src_stride, pred_ptr, pred_stride); + break; + } + } else { + vp9_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr, src_stride, + pred_ptr, pred_stride); + } +} diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/mips/msa/vp9_temporal_filter_msa.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/mips/msa/vp9_temporal_filter_msa.c new file mode 100644 index 00000000000..4053bffaef2 --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/mips/msa/vp9_temporal_filter_msa.c @@ -0,0 +1,289 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp9_rtcd.h" +#include "vp9/common/mips/msa/vp9_macros_msa.h" + +static void temporal_filter_apply_8size_msa(uint8_t *frm1_ptr, + uint32_t stride, + uint8_t *frm2_ptr, + int32_t filt_sth, + int32_t filt_wgt, + uint32_t *acc, + uint16_t *cnt) { + uint32_t row; + uint64_t f0, f1, f2, f3; + v16i8 frm2, frm1 = { 0 }; + v16i8 frm4, frm3 = { 0 }; + v16u8 frm_r, frm_l; + v8i16 frm2_r, frm2_l; + v8i16 diff0, diff1, mod0_h, mod1_h; + v4i32 cnst3, cnst16, filt_wt, strength; + v4i32 mod0_w, mod1_w, mod2_w, mod3_w; + v4i32 diff0_r, diff0_l, diff1_r, diff1_l; + v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll; + v4i32 acc0, acc1, acc2, acc3; + v8i16 cnt0, cnt1; + + filt_wt = __msa_fill_w(filt_wgt); + strength = __msa_fill_w(filt_sth); + cnst3 = __msa_ldi_w(3); + cnst16 = __msa_ldi_w(16); + + for (row = 2; row--;) { + LD4(frm1_ptr, stride, f0, f1, f2, f3); + frm1_ptr += (4 * stride); + + LD_SB2(frm2_ptr, 16, frm2, frm4); + frm2_ptr += 32; + + LD_SW2(acc, 4, acc0, acc1); + LD_SW2(acc + 8, 4, acc2, acc3); + LD_SH2(cnt, 8, cnt0, cnt1); + + INSERT_D2_SB(f0, f1, frm1); + INSERT_D2_SB(f2, f3, frm3); + ILVRL_B2_UB(frm1, frm2, frm_r, frm_l); + HSUB_UB2_SH(frm_r, frm_l, diff0, diff1); + UNPCK_SH_SW(diff0, diff0_r, diff0_l); + UNPCK_SH_SW(diff1, diff1_r, diff1_l); + MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, + diff1_l, mod0_w, mod1_w, mod2_w, mod3_w); + MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, + mod0_w, mod1_w, mod2_w, mod3_w); + SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); + + diff0_r = (mod0_w < cnst16); + diff0_l = (mod1_w < cnst16); + diff1_r = (mod2_w < cnst16); + diff1_l = (mod3_w < cnst16); + + SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, + mod0_w, mod1_w, mod2_w, mod3_w); + + mod0_w = diff0_r & mod0_w; + mod1_w = diff0_l & mod1_w; + mod2_w = diff1_r & mod2_w; + mod3_w = diff1_l & mod3_w; + + MUL4(mod0_w, 
filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt, + mod0_w, mod1_w, mod2_w, mod3_w); + PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); + ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); + ST_SH2(mod0_h, mod1_h, cnt, 8); + cnt += 16; + + UNPCK_UB_SH(frm2, frm2_r, frm2_l); + UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl); + UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll); + MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll, + mod0_w, mod1_w, mod2_w, mod3_w); + ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, + mod0_w, mod1_w, mod2_w, mod3_w); + + ST_SW2(mod0_w, mod1_w, acc, 4); + acc += 8; + ST_SW2(mod2_w, mod3_w, acc, 4); + acc += 8; + + LD_SW2(acc, 4, acc0, acc1); + LD_SW2(acc + 8, 4, acc2, acc3); + LD_SH2(cnt, 8, cnt0, cnt1); + + ILVRL_B2_UB(frm3, frm4, frm_r, frm_l); + HSUB_UB2_SH(frm_r, frm_l, diff0, diff1); + UNPCK_SH_SW(diff0, diff0_r, diff0_l); + UNPCK_SH_SW(diff1, diff1_r, diff1_l); + MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, + diff1_l, mod0_w, mod1_w, mod2_w, mod3_w); + MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, + mod0_w, mod1_w, mod2_w, mod3_w); + SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); + + diff0_r = (mod0_w < cnst16); + diff0_l = (mod1_w < cnst16); + diff1_r = (mod2_w < cnst16); + diff1_l = (mod3_w < cnst16); + + SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, + mod0_w, mod1_w, mod2_w, mod3_w); + + mod0_w = diff0_r & mod0_w; + mod1_w = diff0_l & mod1_w; + mod2_w = diff1_r & mod2_w; + mod3_w = diff1_l & mod3_w; + + MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt, + mod0_w, mod1_w, mod2_w, mod3_w); + PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); + ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); + ST_SH2(mod0_h, mod1_h, cnt, 8); + cnt += 16; + UNPCK_UB_SH(frm4, frm2_r, frm2_l); + UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl); + UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll); + MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll, + mod0_w, mod1_w, mod2_w, mod3_w); + ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, + mod0_w, mod1_w, mod2_w, mod3_w); + + ST_SW2(mod0_w, mod1_w, acc, 4); + acc += 8; + ST_SW2(mod2_w, mod3_w, acc, 4); + acc += 8; + } +} + +static void temporal_filter_apply_16size_msa(uint8_t *frm1_ptr, + uint32_t stride, + uint8_t *frm2_ptr, + int32_t filt_sth, + int32_t filt_wgt, + uint32_t *acc, + uint16_t *cnt) { + uint32_t row; + v16i8 frm1, frm2, frm3, frm4; + v16u8 frm_r, frm_l; + v16i8 zero = { 0 }; + v8u16 frm2_r, frm2_l; + v8i16 diff0, diff1, mod0_h, mod1_h; + v4i32 cnst3, cnst16, filt_wt, strength; + v4i32 mod0_w, mod1_w, mod2_w, mod3_w; + v4i32 diff0_r, diff0_l, diff1_r, diff1_l; + v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll; + v4i32 acc0, acc1, acc2, acc3; + v8i16 cnt0, cnt1; + + filt_wt = __msa_fill_w(filt_wgt); + strength = __msa_fill_w(filt_sth); + cnst3 = __msa_ldi_w(3); + cnst16 = __msa_ldi_w(16); + + for (row = 8; row--;) { + LD_SB2(frm1_ptr, stride, frm1, frm3); + frm1_ptr += stride; + + LD_SB2(frm2_ptr, 16, frm2, frm4); + frm2_ptr += 16; + + LD_SW2(acc, 4, acc0, acc1); + LD_SW2(acc, 4, acc2, acc3); + LD_SH2(cnt, 8, cnt0, cnt1); + + ILVRL_B2_UB(frm1, frm2, frm_r, frm_l); + HSUB_UB2_SH(frm_r, frm_l, diff0, diff1); + UNPCK_SH_SW(diff0, diff0_r, diff0_l); + UNPCK_SH_SW(diff1, diff1_r, diff1_l); + MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, + mod0_w, mod1_w, mod2_w, mod3_w); + MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, + mod0_w, 
mod1_w, mod2_w, mod3_w); + SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); + + diff0_r = (mod0_w < cnst16); + diff0_l = (mod1_w < cnst16); + diff1_r = (mod2_w < cnst16); + diff1_l = (mod3_w < cnst16); + + SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, + mod0_w, mod1_w, mod2_w, mod3_w); + + mod0_w = diff0_r & mod0_w; + mod1_w = diff0_l & mod1_w; + mod2_w = diff1_r & mod2_w; + mod3_w = diff1_l & mod3_w; + + MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt, + mod0_w, mod1_w, mod2_w, mod3_w); + PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); + ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); + ST_SH2(mod0_h, mod1_h, cnt, 8); + cnt += 16; + + ILVRL_B2_UH(zero, frm2, frm2_r, frm2_l); + UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl); + UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll); + MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll, + mod0_w, mod1_w, mod2_w, mod3_w); + ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, + mod0_w, mod1_w, mod2_w, mod3_w); + + ST_SW2(mod0_w, mod1_w, acc, 4); + acc += 8; + ST_SW2(mod2_w, mod3_w, acc, 4); + acc += 8; + + LD_SW2(acc, 4, acc0, acc1); + LD_SW2(acc + 8, 4, acc2, acc3); + LD_SH2(cnt, 8, cnt0, cnt1); + + ILVRL_B2_UB(frm3, frm4, frm_r, frm_l); + HSUB_UB2_SH(frm_r, frm_l, diff0, diff1); + UNPCK_SH_SW(diff0, diff0_r, diff0_l); + UNPCK_SH_SW(diff1, diff1_r, diff1_l); + MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, + mod0_w, mod1_w, mod2_w, mod3_w); + MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, + mod0_w, mod1_w, mod2_w, mod3_w); + SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); + + diff0_r = (mod0_w < cnst16); + diff0_l = (mod1_w < cnst16); + diff1_r = (mod2_w < cnst16); + diff1_l = (mod3_w < cnst16); + + SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, + mod0_w, mod1_w, mod2_w, mod3_w); + + mod0_w = diff0_r & mod0_w; + mod1_w = diff0_l & mod1_w; + mod2_w = diff1_r & mod2_w; + mod3_w = diff1_l & mod3_w; + + MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt, + mod0_w, mod1_w, mod2_w, mod3_w); + PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); + ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); + ST_SH2(mod0_h, mod1_h, cnt, 8); + cnt += 16; + + ILVRL_B2_UH(zero, frm4, frm2_r, frm2_l); + UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl); + UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll); + MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll, + mod0_w, mod1_w, mod2_w, mod3_w); + ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, + mod0_w, mod1_w, mod2_w, mod3_w); + ST_SW2(mod0_w, mod1_w, acc, 4); + acc += 8; + ST_SW2(mod2_w, mod3_w, acc, 4); + acc += 8; + + frm1_ptr += stride; + frm2_ptr += 16; + } +} + +void vp9_temporal_filter_apply_msa(uint8_t *frame1_ptr, uint32_t stride, + uint8_t *frame2_ptr, uint32_t blk_w, + uint32_t blk_h, int32_t strength, + int32_t filt_wgt, uint32_t *accu, + uint16_t *cnt) { + if (8 == (blk_w * blk_h)) { + temporal_filter_apply_8size_msa(frame1_ptr, stride, frame2_ptr, + strength, filt_wgt, accu, cnt); + } else if (16 == (blk_w * blk_h)) { + temporal_filter_apply_16size_msa(frame1_ptr, stride, frame2_ptr, + strength, filt_wgt, accu, cnt); + } else { + vp9_temporal_filter_apply_c(frame1_ptr, stride, frame2_ptr, blk_w, blk_h, + strength, filt_wgt, accu, cnt); + } +} diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_complexity.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_complexity.c index 
9622ba1d67e..bea7653d2a7 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_complexity.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_complexity.c @@ -11,6 +11,7 @@ #include <limits.h> #include <math.h> +#include "vp9/encoder/vp9_aq_complexity.h" #include "vp9/encoder/vp9_aq_variance.h" #include "vp9/encoder/vp9_encodeframe.h" #include "vp9/common/vp9_seg_common.h" diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_complexity.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_complexity.h index c0dce6c5b7d..e9acb1ca504 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_complexity.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_complexity.h @@ -16,6 +16,8 @@ extern "C" { #endif +#include "vp9/common/vp9_enums.h" + struct VP9_COMP; struct macroblock; diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c index 24b427df575..6270bf45293 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c @@ -39,6 +39,8 @@ struct CYCLIC_REFRESH { int rdmult; // Cyclic refresh map. signed char *map; + // Map of the last q a block was coded at. + uint8_t *last_coded_q_map; // Thresholds applied to the projected rate/distortion of the coding block, // when deciding whether block should be refreshed. int64_t thresh_rate_sb; @@ -48,12 +50,14 @@ struct CYCLIC_REFRESH { int16_t motion_thresh; // Rate target ratio to set q delta. double rate_ratio_qdelta; + // Boost factor for rate target ratio, for segment CR_SEGMENT_ID_BOOST2. + int rate_boost_fac; double low_content_avg; - int qindex_delta_seg1; - int qindex_delta_seg2; + int qindex_delta[3]; }; CYCLIC_REFRESH *vp9_cyclic_refresh_alloc(int mi_rows, int mi_cols) { + size_t last_coded_q_map_size; CYCLIC_REFRESH *const cr = vpx_calloc(1, sizeof(*cr)); if (cr == NULL) return NULL; @@ -63,12 +67,21 @@ CYCLIC_REFRESH *vp9_cyclic_refresh_alloc(int mi_rows, int mi_cols) { vpx_free(cr); return NULL; } + last_coded_q_map_size = mi_rows * mi_cols * sizeof(*cr->last_coded_q_map); + cr->last_coded_q_map = vpx_malloc(last_coded_q_map_size); + if (cr->last_coded_q_map == NULL) { + vpx_free(cr); + return NULL; + } + assert(MAXQ <= 255); + memset(cr->last_coded_q_map, MAXQ, last_coded_q_map_size); return cr; } void vp9_cyclic_refresh_free(CYCLIC_REFRESH *cr) { vpx_free(cr->map); + vpx_free(cr->last_coded_q_map); vpx_free(cr); } @@ -116,7 +129,8 @@ static int candidate_refresh_aq(const CYCLIC_REFRESH *cr, else if (bsize >= BLOCK_16X16 && rate < cr->thresh_rate_sb && is_inter_block(mbmi) && - mbmi->mv[0].as_int == 0) + mbmi->mv[0].as_int == 0 && + cr->rate_boost_fac > 10) // More aggressive delta-q for bigger blocks with zero motion. 
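      /* Annotation, not part of this change: rate_boost_fac is the new field
       * initialised in vp9_cyclic_refresh_update_parameters() further down in
       * this patch -- 17 by default, 10 for small (<= 352x288), low-bitrate
       * streams -- so the "rate_boost_fac > 10" test above effectively
       * disables the extra BOOST2 delta-q for large zero-motion inter blocks
       * in that low-resolution, low-bitrate case. */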
return CR_SEGMENT_ID_BOOST2; else @@ -157,11 +171,11 @@ int vp9_cyclic_refresh_estimate_bits_at_q(const VP9_COMP *cpi, correction_factor, cm->bit_depth) + weight_segment1 * vp9_estimate_bits_at_q(cm->frame_type, - cm->base_qindex + cr->qindex_delta_seg1, mbs, + cm->base_qindex + cr->qindex_delta[1], mbs, correction_factor, cm->bit_depth) + weight_segment2 * vp9_estimate_bits_at_q(cm->frame_type, - cm->base_qindex + cr->qindex_delta_seg2, mbs, + cm->base_qindex + cr->qindex_delta[2], mbs, correction_factor, cm->bit_depth)); return estimated_bits; } @@ -246,9 +260,16 @@ void vp9_cyclic_refresh_update_segment(VP9_COMP *const cpi, // copy mbmi->segment_id into global segmentation map. for (y = 0; y < ymis; y++) for (x = 0; x < xmis; x++) { - cr->map[block_index + y * cm->mi_cols + x] = new_map_value; - cpi->segmentation_map[block_index + y * cm->mi_cols + x] = - mbmi->segment_id; + int map_offset = block_index + y * cm->mi_cols + x; + cr->map[map_offset] = new_map_value; + cpi->segmentation_map[map_offset] = mbmi->segment_id; + // Inter skip blocks were clearly not coded at the current qindex, so + // don't update the map for them. For cases where motion is non-zero or + // the reference frame isn't the previous frame, the previous value in + // the map for this spatial location is not entirely correct. + if (!is_inter_block(mbmi) || !skip) + cr->last_coded_q_map[map_offset] = clamp( + cm->base_qindex + cr->qindex_delta[mbmi->segment_id], 0, MAXQ); } } @@ -357,7 +378,7 @@ void vp9_cyclic_refresh_check_golden_update(VP9_COMP *const cpi) { // 1/CR_SEGMENT_ID_BOOST1 (refresh) for each superblock. // Blocks labeled as BOOST1 may later get set to BOOST2 (during the // encoding of the superblock). -void vp9_cyclic_refresh_update_map(VP9_COMP *const cpi) { +static void cyclic_refresh_update_map(VP9_COMP *const cpi) { VP9_COMMON *const cm = &cpi->common; CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; unsigned char *const seg_map = cpi->segmentation_map; @@ -382,6 +403,10 @@ void vp9_cyclic_refresh_update_map(VP9_COMP *const cpi) { int sb_col_index = i - sb_row_index * sb_cols; int mi_row = sb_row_index * MI_BLOCK_SIZE; int mi_col = sb_col_index * MI_BLOCK_SIZE; + int qindex_thresh = + cpi->oxcf.content == VP9E_CONTENT_SCREEN + ? vp9_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST2, cm->base_qindex) + : 0; assert(mi_row >= 0 && mi_row < cm->mi_rows); assert(mi_col >= 0 && mi_col < cm->mi_cols); bl_index = mi_row * cm->mi_cols + mi_col; @@ -397,7 +422,8 @@ void vp9_cyclic_refresh_update_map(VP9_COMP *const cpi) { // for possible boost/refresh (segment 1). The segment id may get // reset to 0 later if block gets coded anything other than ZEROMV. if (cr->map[bl_index2] == 0) { - sum_map++; + if (cr->last_coded_q_map[bl_index2] > qindex_thresh) + sum_map++; } else if (cr->map[bl_index2] < 0) { cr->map[bl_index2]++; } @@ -420,18 +446,30 @@ void vp9_cyclic_refresh_update_map(VP9_COMP *const cpi) { cr->sb_index = i; } -// Set/update global/frame level cyclic refresh parameters. +// Set cyclic refresh parameters. void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) { const RATE_CONTROL *const rc = &cpi->rc; + const VP9_COMMON *const cm = &cpi->common; CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; cr->percent_refresh = 10; + cr->max_qdelta_perc = 50; + cr->time_for_refresh = 0; // Use larger delta-qp (increase rate_ratio_qdelta) for first few (~4) - // periods of the refresh cycle, after a key frame. This corresponds to ~40 - // frames with cr->percent_refresh = 10. 
- if (rc->frames_since_key < 40) + // periods of the refresh cycle, after a key frame. + if (rc->frames_since_key < 4 * cr->percent_refresh) cr->rate_ratio_qdelta = 3.0; else cr->rate_ratio_qdelta = 2.0; + // Adjust some parameters for low resolutions at low bitrates. + if (cm->width <= 352 && + cm->height <= 288 && + rc->avg_frame_bandwidth < 3400) { + cr->motion_thresh = 4; + cr->rate_boost_fac = 10; + } else { + cr->motion_thresh = 32; + cr->rate_boost_fac = 17; + } } // Setup cyclic background refresh: set delta q and segmentation map. @@ -452,16 +490,17 @@ void vp9_cyclic_refresh_setup(VP9_COMP *const cpi) { unsigned char *const seg_map = cpi->segmentation_map; memset(seg_map, 0, cm->mi_rows * cm->mi_cols); vp9_disable_segmentation(&cm->seg); - if (cm->frame_type == KEY_FRAME) + if (cm->frame_type == KEY_FRAME) { + memset(cr->last_coded_q_map, MAXQ, + cm->mi_rows * cm->mi_cols * sizeof(*cr->last_coded_q_map)); cr->sb_index = 0; + } return; } else { int qindex_delta = 0; int qindex2; const double q = vp9_convert_qindex_to_q(cm->base_qindex, cm->bit_depth); vp9_clear_system_state(); - cr->max_qdelta_perc = 50; - cr->time_for_refresh = 0; // Set rate threshold to some multiple (set to 2 for now) of the target // rate (target is given by sb64_target_rate and scaled by 256). cr->thresh_rate_sb = ((int64_t)(rc->sb64_target_rate) << 8) << 2; @@ -469,7 +508,7 @@ void vp9_cyclic_refresh_setup(VP9_COMP *const cpi) { // q will not exceed 457, so (q * q) is within 32bit; see: // vp9_convert_qindex_to_q(), vp9_ac_quant(), ac_qlookup*[]. cr->thresh_dist_sb = ((int64_t)(q * q)) << 2; - cr->motion_thresh = 32; + // Set up segmentation. // Clear down the segment map. vp9_enable_segmentation(&cm->seg); @@ -493,7 +532,7 @@ void vp9_cyclic_refresh_setup(VP9_COMP *const cpi) { // Set the q delta for segment BOOST1. qindex_delta = compute_deltaq(cpi, cm->base_qindex, cr->rate_ratio_qdelta); - cr->qindex_delta_seg1 = qindex_delta; + cr->qindex_delta[1] = qindex_delta; // Compute rd-mult for segment BOOST1. qindex2 = clamp(cm->base_qindex + cm->y_dc_delta_q + qindex_delta, 0, MAXQ); @@ -503,17 +542,24 @@ void vp9_cyclic_refresh_setup(VP9_COMP *const cpi) { vp9_set_segdata(seg, CR_SEGMENT_ID_BOOST1, SEG_LVL_ALT_Q, qindex_delta); // Set a more aggressive (higher) q delta for segment BOOST2. - qindex_delta = compute_deltaq(cpi, cm->base_qindex, - MIN(CR_MAX_RATE_TARGET_RATIO, - CR_BOOST2_FAC * cr->rate_ratio_qdelta)); - cr->qindex_delta_seg2 = qindex_delta; + qindex_delta = compute_deltaq( + cpi, cm->base_qindex, MIN(CR_MAX_RATE_TARGET_RATIO, + 0.1 * cr->rate_boost_fac * cr->rate_ratio_qdelta)); + cr->qindex_delta[2] = qindex_delta; vp9_set_segdata(seg, CR_SEGMENT_ID_BOOST2, SEG_LVL_ALT_Q, qindex_delta); // Update the segmentation and refresh map. 
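// Editorial aside, not part of the patch: the new last_coded_q_map ties the
// hunks above together. It is allocated next to cr->map and filled with MAXQ
// (and re-filled on key frames, as in the hunk just above); every coded block
// that is not an inter skip block records
//   clamp(cm->base_qindex + cr->qindex_delta[mbmi->segment_id], 0, MAXQ)
// in vp9_cyclic_refresh_update_segment(); and cyclic_refresh_update_map()
// below only counts a block as a refresh candidate when its last coded q is
// above qindex_thresh, which is non-zero only for VP9E_CONTENT_SCREEN, so
// screen-content blocks already coded at or below the BOOST2 q are skipped.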
- vp9_cyclic_refresh_update_map(cpi); + cyclic_refresh_update_map(cpi); } } int vp9_cyclic_refresh_get_rdmult(const CYCLIC_REFRESH *cr) { return cr->rdmult; } + +void vp9_cyclic_refresh_reset_resize(VP9_COMP *const cpi) { + const VP9_COMMON *const cm = &cpi->common; + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + memset(cr->map, 0, cm->mi_rows * cm->mi_cols); + cr->sb_index = 0; +} diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h index 21f114b5e59..29d2a91bc69 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h @@ -27,9 +27,6 @@ extern "C" { // Maximum rate target ratio for setting segment delta-qp. #define CR_MAX_RATE_TARGET_RATIO 4.0 -// Boost factor for rate target ratio, for segment CR_SEGMENT_ID_BOOST2. -#define CR_BOOST2_FAC 1.7 - struct VP9_COMP; struct CYCLIC_REFRESH; @@ -78,6 +75,8 @@ void vp9_cyclic_refresh_setup(struct VP9_COMP *const cpi); int vp9_cyclic_refresh_get_rdmult(const CYCLIC_REFRESH *cr); +void vp9_cyclic_refresh_reset_resize(struct VP9_COMP *const cpi); + static INLINE int cyclic_refresh_segment_id_boosted(int segment_id) { return segment_id == CR_SEGMENT_ID_BOOST1 || segment_id == CR_SEGMENT_ID_BOOST2; diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_variance.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_variance.c index be6f7e4ee53..f072717f1d9 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_variance.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_variance.c @@ -10,6 +10,8 @@ #include <math.h> +#include "vpx_ports/mem.h" + #include "vp9/encoder/vp9_aq_variance.h" #include "vp9/common/vp9_seg_common.h" @@ -80,6 +82,61 @@ void vp9_vaq_frame_setup(VP9_COMP *cpi) { } } +/* TODO(agrange, paulwilkins): The block_variance calls the unoptimized versions + * of variance() and highbd_8_variance(). It should not. 
+ */ +static void aq_variance(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + int w, int h, unsigned int *sse, int *sum) { + int i, j; + + *sum = 0; + *sse = 0; + + for (i = 0; i < h; i++) { + for (j = 0; j < w; j++) { + const int diff = a[j] - b[j]; + *sum += diff; + *sse += diff * diff; + } + + a += a_stride; + b += b_stride; + } +} + +#if CONFIG_VP9_HIGHBITDEPTH +static void aq_highbd_variance64(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + int w, int h, uint64_t *sse, uint64_t *sum) { + int i, j; + + uint16_t *a = CONVERT_TO_SHORTPTR(a8); + uint16_t *b = CONVERT_TO_SHORTPTR(b8); + *sum = 0; + *sse = 0; + + for (i = 0; i < h; i++) { + for (j = 0; j < w; j++) { + const int diff = a[j] - b[j]; + *sum += diff; + *sse += diff * diff; + } + a += a_stride; + b += b_stride; + } +} + +static void aq_highbd_8_variance(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + int w, int h, unsigned int *sse, int *sum) { + uint64_t sse_long = 0; + uint64_t sum_long = 0; + aq_highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long); + *sse = (unsigned int)sse_long; + *sum = (int)sum_long; +} +#endif // CONFIG_VP9_HIGHBITDEPTH static unsigned int block_variance(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) { @@ -96,18 +153,18 @@ static unsigned int block_variance(VP9_COMP *cpi, MACROBLOCK *x, int avg; #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - highbd_variance(x->plane[0].src.buf, x->plane[0].src.stride, - CONVERT_TO_BYTEPTR(vp9_highbd_64_zeros), 0, bw, bh, - &sse, &avg); + aq_highbd_8_variance(x->plane[0].src.buf, x->plane[0].src.stride, + CONVERT_TO_BYTEPTR(vp9_highbd_64_zeros), 0, bw, bh, + &sse, &avg); sse >>= 2 * (xd->bd - 8); avg >>= (xd->bd - 8); } else { - variance(x->plane[0].src.buf, x->plane[0].src.stride, - vp9_64_zeros, 0, bw, bh, &sse, &avg); + aq_variance(x->plane[0].src.buf, x->plane[0].src.stride, + vp9_64_zeros, 0, bw, bh, &sse, &avg); } #else - variance(x->plane[0].src.buf, x->plane[0].src.stride, - vp9_64_zeros, 0, bw, bh, &sse, &avg); + aq_variance(x->plane[0].src.buf, x->plane[0].src.stride, + vp9_64_zeros, 0, bw, bh, &sse, &avg); #endif // CONFIG_VP9_HIGHBITDEPTH var = sse - (((int64_t)avg * avg) / (bw * bh)); return (256 * var) / (bw * bh); diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_avg.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_avg.c index 95b13bb7718..3ef3882d280 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_avg.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_avg.c @@ -7,6 +7,7 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. 
*/ +#include "./vp9_rtcd.h" #include "vp9/common/vp9_common.h" #include "vpx_ports/mem.h" @@ -28,6 +29,8 @@ unsigned int vp9_avg_4x4_c(const uint8_t *s, int p) { return (sum + 8) >> 4; } +// src_diff: first pass, 9 bit, dynamic range [-255, 255] +// second pass, 12 bit, dynamic range [-2040, 2040] static void hadamard_col8(const int16_t *src_diff, int src_stride, int16_t *coeff) { int16_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride]; @@ -64,15 +67,18 @@ void vp9_hadamard_8x8_c(int16_t const *src_diff, int src_stride, int16_t buffer[64]; int16_t *tmp_buf = &buffer[0]; for (idx = 0; idx < 8; ++idx) { - hadamard_col8(src_diff, src_stride, tmp_buf); + hadamard_col8(src_diff, src_stride, tmp_buf); // src_diff: 9 bit + // dynamic range [-255, 255] tmp_buf += 8; ++src_diff; } tmp_buf = &buffer[0]; for (idx = 0; idx < 8; ++idx) { - hadamard_col8(tmp_buf, 8, coeff); - coeff += 8; + hadamard_col8(tmp_buf, 8, coeff); // tmp_buf: 12 bit + // dynamic range [-2040, 2040] + coeff += 8; // coeff: 15 bit + // dynamic range [-16320, 16320] ++tmp_buf; } } @@ -82,58 +88,68 @@ void vp9_hadamard_16x16_c(int16_t const *src_diff, int src_stride, int16_t *coeff) { int idx; for (idx = 0; idx < 4; ++idx) { + // src_diff: 9 bit, dynamic range [-255, 255] int16_t const *src_ptr = src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8; vp9_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64); } + // coeff: 15 bit, dynamic range [-16320, 16320] for (idx = 0; idx < 64; ++idx) { int16_t a0 = coeff[0]; int16_t a1 = coeff[64]; int16_t a2 = coeff[128]; int16_t a3 = coeff[192]; - int16_t b0 = a0 + a1; - int16_t b1 = a0 - a1; - int16_t b2 = a2 + a3; - int16_t b3 = a2 - a3; + int16_t b0 = (a0 + a1) >> 1; // (a0 + a1): 16 bit, [-32640, 32640] + int16_t b1 = (a0 - a1) >> 1; // b0-b3: 15 bit, dynamic range + int16_t b2 = (a2 + a3) >> 1; // [-16320, 16320] + int16_t b3 = (a2 - a3) >> 1; - coeff[0] = (b0 + b2) >> 1; - coeff[64] = (b1 + b3) >> 1; - coeff[128] = (b0 - b2) >> 1; - coeff[192] = (b1 - b3) >> 1; + coeff[0] = b0 + b2; // 16 bit, [-32640, 32640] + coeff[64] = b1 + b3; + coeff[128] = b0 - b2; + coeff[192] = b1 - b3; ++coeff; } } +// coeff: 16 bits, dynamic range [-32640, 32640]. +// length: value range {16, 64, 256, 1024}. int16_t vp9_satd_c(const int16_t *coeff, int length) { int i; int satd = 0; for (i = 0; i < length; ++i) satd += abs(coeff[i]); + // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024] return (int16_t)satd; } // Integer projection onto row vectors. -void vp9_int_pro_row_c(int16_t *hbuf, uint8_t const *ref, +// height: value range {16, 32, 64}. +void vp9_int_pro_row_c(int16_t hbuf[16], uint8_t const *ref, const int ref_stride, const int height) { int idx; - const int norm_factor = MAX(8, height >> 1); + const int norm_factor = height >> 1; for (idx = 0; idx < 16; ++idx) { int i; hbuf[idx] = 0; + // hbuf[idx]: 14 bit, dynamic range [0, 16320]. for (i = 0; i < height; ++i) hbuf[idx] += ref[i * ref_stride]; + // hbuf[idx]: 9 bit, dynamic range [0, 510]. hbuf[idx] /= norm_factor; ++ref; } } +// width: value range {16, 32, 64}. 
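// Editorial aside, not part of the patch: the new range comments in this file
// can be checked by hand. In vp9_int_pro_row_c with height = 64, each
// hbuf[idx] accumulates at most 64 * 255 = 16320 (14 bits) and is then divided
// by norm_factor = 64 >> 1 = 32, giving at most 510 (9 bits); since height is
// one of {16, 32, 64}, height >> 1 is always >= 8, so the old
// MAX(8, height >> 1) clamp was redundant. Likewise, moving the >> 1 into the
// first butterfly stage of vp9_hadamard_16x16_c keeps every partial sum within
// +/-32640: the old ordering formed b0 + b2 with b0 and b2 already up to
// +/-32640, i.e. up to +/-65280, which is harmless in C thanks to integer
// promotion but would overflow a signed 16-bit SIMD lane, presumably the
// motivation for the change.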
int16_t vp9_int_pro_col_c(uint8_t const *ref, const int width) { int idx; int16_t sum = 0; + // sum: 14 bit, dynamic range [0, 16320] for (idx = 0; idx < width; ++idx) sum += ref[idx]; return sum; diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_bitstream.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_bitstream.c index d20e067669f..1ebdd066bfe 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_bitstream.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_bitstream.c @@ -93,7 +93,7 @@ static void write_selected_tx_size(const VP9_COMMON *cm, static int write_skip(const VP9_COMMON *cm, const MACROBLOCKD *xd, int segment_id, const MODE_INFO *mi, vp9_writer *w) { - if (vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) { + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) { return 1; } else { const int skip = mi->mbmi.skip; @@ -207,10 +207,10 @@ static void write_ref_frames(const VP9_COMMON *cm, const MACROBLOCKD *xd, // If segment level coding of this signal is disabled... // or the segment allows multiple reference frame options - if (vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) { + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) { assert(!is_compound); assert(mbmi->ref_frame[0] == - vp9_get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME)); + get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME)); } else { // does the feature use compound prediction or not // (if not specified at the frame/segment level) @@ -264,7 +264,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi, skip = write_skip(cm, xd, segment_id, mi, w); - if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) + if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) vp9_write(w, is_inter, vp9_get_intra_inter_prob(cm, xd)); if (bsize >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT && @@ -293,7 +293,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi, write_ref_frames(cm, xd, w); // If segment skip is not enabled code the mode. 
- if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)) { + if (!segfeature_active(seg, segment_id, SEG_LVL_SKIP)) { if (bsize >= BLOCK_8X8) { write_inter_mode(w, mode, inter_probs); } @@ -403,7 +403,7 @@ static void write_partition(const VP9_COMMON *const cm, int hbs, int mi_row, int mi_col, PARTITION_TYPE p, BLOCK_SIZE bsize, vp9_writer *w) { const int ctx = partition_plane_context(xd, mi_row, mi_col, bsize); - const vp9_prob *const probs = get_partition_probs(cm, ctx); + const vp9_prob *const probs = xd->partition_probs[ctx]; const int has_rows = (mi_row + hbs) < cm->mi_rows; const int has_cols = (mi_col + hbs) < cm->mi_cols; @@ -481,9 +481,12 @@ static void write_modes_sb(VP9_COMP *cpi, static void write_modes(VP9_COMP *cpi, const TileInfo *const tile, vp9_writer *w, TOKENEXTRA **tok, const TOKENEXTRA *const tok_end) { + const VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; int mi_row, mi_col; + set_partition_probs(cm, xd); + for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end; mi_row += MI_BLOCK_SIZE) { vp9_zero(xd->left_seg_context); @@ -787,10 +790,10 @@ static void encode_segmentation(VP9_COMMON *cm, MACROBLOCKD *xd, for (i = 0; i < MAX_SEGMENTS; i++) { for (j = 0; j < SEG_LVL_MAX; j++) { - const int active = vp9_segfeature_active(seg, i, j); + const int active = segfeature_active(seg, i, j); vp9_wb_write_bit(wb, active); if (active) { - const int data = vp9_get_segdata(seg, i, j); + const int data = get_segdata(seg, i, j); const int data_max = vp9_seg_feature_data_max(j); if (vp9_is_segfeature_signed(j)) { diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_dct.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_dct.c index 9e6ca3d594c..414d2bb1502 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_dct.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_dct.c @@ -14,6 +14,7 @@ #include "./vpx_config.h" #include "./vp9_rtcd.h" +#include "vpx_ports/mem.h" #include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_idct.h" #include "vp9/common/vp9_systemdependent.h" diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeframe.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeframe.c index 0e74784e9b6..b4059752ee7 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeframe.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeframe.c @@ -13,8 +13,10 @@ #include <stdio.h> #include "./vp9_rtcd.h" +#include "./vpx_dsp_rtcd.h" #include "./vpx_config.h" +#include "vpx_ports/mem.h" #include "vpx_ports/vpx_timer.h" #include "vp9/common/vp9_common.h" @@ -360,7 +362,7 @@ static void get_variance(var *v) { ((v->sum_error * v->sum_error) >> v->log2_count)) >> v->log2_count); } -void sum_2_variances(const var *a, const var *b, var *r) { +static void sum_2_variances(const var *a, const var *b, var *r) { assert(a->log2_count == b->log2_count); fill_variance(a->sum_square_error + b->sum_square_error, a->sum_error + b->sum_error, a->log2_count + 1, r); @@ -368,6 +370,7 @@ void sum_2_variances(const var *a, const var *b, var *r) { static void fill_variance_tree(void *data, BLOCK_SIZE bsize) { variance_node node; + memset(&node, 0, sizeof(node)); tree_to_node(data, bsize, &node); sum_2_variances(node.split[0], node.split[1], &node.part_variances->horz[0]); sum_2_variances(node.split[2], node.split[3], &node.part_variances->horz[1]); @@ -462,46 +465,55 @@ static int set_vt_partitioning(VP9_COMP *cpi, return 0; } -void 
vp9_set_vbp_thresholds(VP9_COMP *cpi, int q) { +// Set the variance split thresholds for following the block sizes: +// 0 - threshold_64x64, 1 - threshold_32x32, 2 - threshold_16x16, +// 3 - vbp_threshold_8x8. vbp_threshold_8x8 (to split to 4x4 partition) is +// currently only used on key frame. +static void set_vbp_thresholds(VP9_COMP *cpi, int64_t thresholds[], int q) { + VP9_COMMON *const cm = &cpi->common; + const int is_key_frame = (cm->frame_type == KEY_FRAME); + const int threshold_multiplier = is_key_frame ? 20 : 1; + const int64_t threshold_base = (int64_t)(threshold_multiplier * + cpi->y_dequant[q][1]); + if (is_key_frame) { + thresholds[0] = threshold_base; + thresholds[1] = threshold_base >> 2; + thresholds[2] = threshold_base >> 2; + thresholds[3] = threshold_base << 2; + } else { + thresholds[1] = threshold_base; + if (cm->width <= 352 && cm->height <= 288) { + thresholds[0] = threshold_base >> 2; + thresholds[2] = threshold_base << 3; + } else { + thresholds[0] = threshold_base; + thresholds[1] = (5 * threshold_base) >> 2; + if (cm->width >= 1920 && cm->height >= 1080) + thresholds[1] = (7 * threshold_base) >> 2; + thresholds[2] = threshold_base << cpi->oxcf.speed; + } + } +} + +void vp9_set_variance_partition_thresholds(VP9_COMP *cpi, int q) { + VP9_COMMON *const cm = &cpi->common; SPEED_FEATURES *const sf = &cpi->sf; + const int is_key_frame = (cm->frame_type == KEY_FRAME); if (sf->partition_search_type != VAR_BASED_PARTITION && sf->partition_search_type != REFERENCE_PARTITION) { return; } else { - VP9_COMMON *const cm = &cpi->common; - const int is_key_frame = (cm->frame_type == KEY_FRAME); - const int threshold_multiplier = is_key_frame ? 20 : 1; - const int64_t threshold_base = (int64_t)(threshold_multiplier * - cpi->y_dequant[q][1]); - - // TODO(marpan): Allow 4x4 partitions for inter-frames. - // use_4x4_partition = (variance4x4downsample[i2 + j] == 1); - // If 4x4 partition is not used, then 8x8 partition will be selected - // if variance of 16x16 block is very high, so use larger threshold - // for 16x16 (threshold_bsize_min) in that case. - - // Array index: 0 - threshold_64x64; 1 - threshold_32x32; - // 2 - threshold_16x16; 3 - vbp_threshold_8x8; + set_vbp_thresholds(cpi, cpi->vbp_thresholds, q); + // The thresholds below are not changed locally. if (is_key_frame) { - cpi->vbp_thresholds[0] = threshold_base; - cpi->vbp_thresholds[1] = threshold_base >> 2; - cpi->vbp_thresholds[2] = threshold_base >> 2; - cpi->vbp_thresholds[3] = threshold_base << 2; cpi->vbp_threshold_sad = 0; cpi->vbp_bsize_min = BLOCK_8X8; } else { - cpi->vbp_thresholds[1] = threshold_base; - if (cm->width <= 352 && cm->height <= 288) { - cpi->vbp_thresholds[0] = threshold_base >> 2; - cpi->vbp_thresholds[2] = threshold_base << 3; + if (cm->width <= 352 && cm->height <= 288) cpi->vbp_threshold_sad = 100; - } else { - cpi->vbp_thresholds[0] = threshold_base; - cpi->vbp_thresholds[1] = (5 * threshold_base) >> 2; - cpi->vbp_thresholds[2] = threshold_base << cpi->oxcf.speed; + else cpi->vbp_threshold_sad = (cpi->y_dequant[q][1] << 1) > 1000 ? 
(cpi->y_dequant[q][1] << 1) : 1000; - } cpi->vbp_bsize_min = BLOCK_16X16; } cpi->vbp_threshold_minmax = 15 + (q >> 3); @@ -550,23 +562,6 @@ static int compute_minmax_8x8(const uint8_t *s, int sp, const uint8_t *d, return (minmax_max - minmax_min); } -static void modify_vbp_thresholds(VP9_COMP *cpi, int64_t thresholds[], int q) { - VP9_COMMON *const cm = &cpi->common; - const int64_t threshold_base = (int64_t)(cpi->y_dequant[q][1]); - - // Array index: 0 - threshold_64x64; 1 - threshold_32x32; - // 2 - threshold_16x16; 3 - vbp_threshold_8x8; - thresholds[1] = threshold_base; - if (cm->width <= 352 && cm->height <= 288) { - thresholds[0] = threshold_base >> 2; - thresholds[2] = threshold_base << 3; - } else { - thresholds[0] = threshold_base; - thresholds[1] = (5 * threshold_base) >> 2; - thresholds[2] = threshold_base << cpi->oxcf.speed; - } -} - static void fill_variance_4x4avg(const uint8_t *s, int sp, const uint8_t *d, int dp, int x8_idx, int y8_idx, v8x8 *vst, #if CONFIG_VP9_HIGHBITDEPTH @@ -679,7 +674,7 @@ static int choose_partitioning(VP9_COMP *cpi, if (cyclic_refresh_segment_id_boosted(segment_id)) { int q = vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex); - modify_vbp_thresholds(cpi, thresholds, q); + set_vbp_thresholds(cpi, thresholds, q); } } @@ -693,17 +688,28 @@ static int choose_partitioning(VP9_COMP *cpi, s = x->plane[0].src.buf; sp = x->plane[0].src.stride; - if (!is_key_frame) { + if (!is_key_frame && !(is_one_pass_cbr_svc(cpi) && + cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame)) { + // In the case of spatial/temporal scalable coding, the assumption here is + // that the temporal reference frame will always be of type LAST_FRAME. + // TODO(marpan): If that assumption is broken, we need to revisit this code. MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; unsigned int uv_sad; const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, LAST_FRAME); - const YV12_BUFFER_CONFIG *yv12_g = get_ref_frame_buffer(cpi, GOLDEN_FRAME); + const YV12_BUFFER_CONFIG *yv12_g = NULL; unsigned int y_sad, y_sad_g; const BLOCK_SIZE bsize = BLOCK_32X32 + (mi_col + 4 < cm->mi_cols) * 2 + (mi_row + 4 < cm->mi_rows); assert(yv12 != NULL); + + if (!(is_one_pass_cbr_svc(cpi) && cpi->svc.spatial_layer_id)) { + // For now, GOLDEN will not be used for non-zero spatial layers, since + // it may not be a temporal reference. + yv12_g = get_ref_frame_buffer(cpi, GOLDEN_FRAME); + } + if (yv12_g && yv12_g != yv12) { vp9_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col, &cm->frame_refs[GOLDEN_FRAME - 1].sf); @@ -723,7 +729,7 @@ static int choose_partitioning(VP9_COMP *cpi, mbmi->mv[0].as_int = 0; mbmi->interp_filter = BILINEAR; - y_sad = vp9_int_pro_motion_estimation(cpi, x, bsize); + y_sad = vp9_int_pro_motion_estimation(cpi, x, bsize, mi_row, mi_col); if (y_sad_g < y_sad) { vp9_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col, &cm->frame_refs[GOLDEN_FRAME - 1].sf); @@ -838,7 +844,9 @@ static int choose_partitioning(VP9_COMP *cpi, } } } - if (is_key_frame || (low_res && + // TODO(marpan): There is an issue with variance based on 4x4 average in + // svc mode, don't allow it for now. 
+ if (is_key_frame || (low_res && !cpi->use_svc && vt.split[i].split[j].part_variances.none.variance > (thresholds[1] << 1))) { force_split[split_index] = 0; @@ -1043,7 +1051,7 @@ static void update_state(VP9_COMP *cpi, ThreadData *td, if (!output_enabled) return; - if (!vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { + if (!segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { for (i = 0; i < TX_MODES; i++) rdc->tx_select_diff[i] += ctx->tx_rd_diff[i]; } @@ -1240,7 +1248,7 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, vp9_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, best_rd); } else { if (bsize >= BLOCK_8X8) { - if (vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) + if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) vp9_rd_pick_inter_mode_sb_seg_skip(cpi, tile_data, x, rd_cost, bsize, ctx, best_rd); else @@ -1283,8 +1291,8 @@ static void update_stats(VP9_COMMON *cm, ThreadData *td) { if (!frame_is_intra_only(cm)) { FRAME_COUNTS *const counts = td->counts; const int inter_block = is_inter_block(mbmi); - const int seg_ref_active = vp9_segfeature_active(&cm->seg, mbmi->segment_id, - SEG_LVL_REF_FRAME); + const int seg_ref_active = segfeature_active(&cm->seg, mbmi->segment_id, + SEG_LVL_REF_FRAME); if (!seg_ref_active) { counts->intra_inter[vp9_get_intra_inter_context(xd)][inter_block]++; // If the segment reference feature is enabled we have only a single @@ -1309,7 +1317,7 @@ static void update_stats(VP9_COMMON *cm, ThreadData *td) { } } if (inter_block && - !vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { + !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { const int mode_ctx = mbmi->mode_context[mbmi->ref_frame[0]]; if (bsize >= BLOCK_8X8) { const PREDICTION_MODE mode = mbmi->mode; @@ -1556,7 +1564,7 @@ static void set_fixed_partitioning(VP9_COMP *cpi, const TileInfo *const tile, } } -const struct { +static const struct { int row; int col; } coord_lookup[16] = { @@ -2212,66 +2220,6 @@ static void rd_auto_partition_range(VP9_COMP *cpi, const TileInfo *const tile, *max_block_size = max_size; } -static void auto_partition_range(VP9_COMP *cpi, const TileInfo *const tile, - MACROBLOCKD *const xd, - int mi_row, int mi_col, - BLOCK_SIZE *min_block_size, - BLOCK_SIZE *max_block_size) { - VP9_COMMON *const cm = &cpi->common; - MODE_INFO **mi_8x8 = xd->mi; - const int left_in_image = xd->left_available && mi_8x8[-1]; - const int above_in_image = xd->up_available && mi_8x8[-xd->mi_stride]; - int row8x8_remaining = tile->mi_row_end - mi_row; - int col8x8_remaining = tile->mi_col_end - mi_col; - int bh, bw; - BLOCK_SIZE min_size = BLOCK_32X32; - BLOCK_SIZE max_size = BLOCK_8X8; - int bsl = mi_width_log2_lookup[BLOCK_64X64]; - const int search_range_ctrl = (((mi_row + mi_col) >> bsl) + - get_chessboard_index(cm->current_video_frame)) & 0x1; - // Trap case where we do not have a prediction. - if (search_range_ctrl && - (left_in_image || above_in_image || cm->frame_type != KEY_FRAME)) { - int block; - MODE_INFO **mi; - BLOCK_SIZE sb_type; - - // Find the min and max partition sizes used in the left SB64. - if (left_in_image) { - MODE_INFO *cur_mi; - mi = &mi_8x8[-1]; - for (block = 0; block < MI_BLOCK_SIZE; ++block) { - cur_mi = mi[block * xd->mi_stride]; - sb_type = cur_mi ? cur_mi->mbmi.sb_type : 0; - min_size = MIN(min_size, sb_type); - max_size = MAX(max_size, sb_type); - } - } - // Find the min and max partition sizes used in the above SB64. 
- if (above_in_image) { - mi = &mi_8x8[-xd->mi_stride * MI_BLOCK_SIZE]; - for (block = 0; block < MI_BLOCK_SIZE; ++block) { - sb_type = mi[block] ? mi[block]->mbmi.sb_type : 0; - min_size = MIN(min_size, sb_type); - max_size = MAX(max_size, sb_type); - } - } - - min_size = min_partition_size[min_size]; - max_size = find_partition_size(max_size, row8x8_remaining, col8x8_remaining, - &bh, &bw); - min_size = MIN(min_size, max_size); - min_size = MAX(min_size, BLOCK_8X8); - max_size = MIN(max_size, BLOCK_32X32); - } else { - min_size = BLOCK_8X8; - max_size = BLOCK_32X32; - } - - *min_block_size = min_size; - *max_block_size = max_size; -} - // TODO(jingning) refactor functions setting partition search range static void set_partition_range(VP9_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col, BLOCK_SIZE bsize, @@ -2841,7 +2789,7 @@ static void encode_rd_sb_row(VP9_COMP *cpi, const uint8_t *const map = seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map; int segment_id = vp9_get_segment_id(cm, map, BLOCK_64X64, mi_row, mi_col); - seg_skip = vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP); + seg_skip = segfeature_active(seg, segment_id, SEG_LVL_SKIP); } x->source_variance = UINT_MAX; @@ -2901,7 +2849,7 @@ static void init_encode_frame_mb_context(VP9_COMP *cpi) { static int check_dual_ref_flags(VP9_COMP *cpi) { const int ref_flags = cpi->ref_frame_flags; - if (vp9_segfeature_active(&cpi->common.seg, 1, SEG_LVL_REF_FRAME)) { + if (segfeature_active(&cpi->common.seg, 1, SEG_LVL_REF_FRAME)) { return 0; } else { return (!!(ref_flags & VP9_GOLD_FLAG) + !!(ref_flags & VP9_LAST_FLAG) @@ -2937,8 +2885,7 @@ static TX_MODE select_tx_mode(const VP9_COMP *cpi, MACROBLOCKD *const xd) { if (xd->lossless) return ONLY_4X4; if (cpi->common.frame_type == KEY_FRAME && - cpi->sf.use_nonrd_pick_mode && - cpi->sf.partition_search_type == VAR_BASED_PARTITION) + cpi->sf.use_nonrd_pick_mode) return ALLOW_16X16; if (cpi->sf.tx_size_search_method == USE_LARGESTALL) return ALLOW_32X32; @@ -2976,7 +2923,7 @@ static void nonrd_pick_sb_modes(VP9_COMP *cpi, if (cm->frame_type == KEY_FRAME) hybrid_intra_mode_search(cpi, x, rd_cost, bsize, ctx); - else if (vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) + else if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) set_mode_info_seg_skip(x, cm->tx_mode, rd_cost, bsize); else if (bsize >= BLOCK_8X8) vp9_pick_inter_mode(cpi, x, tile_data, mi_row, mi_col, @@ -3591,7 +3538,7 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, const uint8_t *const map = seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map; int segment_id = vp9_get_segment_id(cm, map, BLOCK_64X64, mi_row, mi_col); - seg_skip = vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP); + seg_skip = segfeature_active(seg, segment_id, SEG_LVL_SKIP); if (seg_skip) { partition_search_type = FIXED_PARTITION; } @@ -3624,15 +3571,26 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_64X64); if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled && xd->mi[0]->mbmi.segment_id) { - x->max_partition_size = BLOCK_64X64; + // Use lower max_partition_size for low resoultions. 
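// Editorial aside, not part of the patch: this is the same CIF-and-below gate
// (width <= 352 && height <= 288) that the patch adds in
// vp9_cyclic_refresh_update_parameters() (together with a bitrate check) and
// in set_vbp_thresholds() / vp9_set_variance_partition_thresholds(), so small
// streams get a lower motion threshold and no BOOST2 rate boost, different
// variance-partition and SAD thresholds, and here a BLOCK_32X32 cap on the
// partition size for blocks in a cyclic-refresh segment.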
+ if (cm->width <= 352 && cm->height <= 288) + x->max_partition_size = BLOCK_32X32; + else + x->max_partition_size = BLOCK_64X64; x->min_partition_size = BLOCK_8X8; nonrd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, BLOCK_64X64, &dummy_rdc, 1, INT64_MAX, td->pc_root); } else { choose_partitioning(cpi, tile_info, x, mi_row, mi_col); - nonrd_select_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, - BLOCK_64X64, 1, &dummy_rdc, td->pc_root); + // TODO(marpan): Seems like nonrd_select_partition does not support + // 4x4 partition. Since 4x4 is used on key frame, use this switch + // for now. + if (cm->frame_type == KEY_FRAME) + nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, + BLOCK_64X64, 1, &dummy_rdc, td->pc_root); + else + nonrd_select_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, + BLOCK_64X64, 1, &dummy_rdc, td->pc_root); } break; @@ -3671,15 +3629,15 @@ static int set_var_thresh_from_histogram(VP9_COMP *cpi) { if (cm->use_highbitdepth) { switch (cm->bit_depth) { case VPX_BITS_8: - vp9_highbd_get16x16var(src, src_stride, last_src, last_stride, + vpx_highbd_8_get16x16var(src, src_stride, last_src, last_stride, &var16->sse, &var16->sum); break; case VPX_BITS_10: - vp9_highbd_10_get16x16var(src, src_stride, last_src, last_stride, + vpx_highbd_10_get16x16var(src, src_stride, last_src, last_stride, &var16->sse, &var16->sum); break; case VPX_BITS_12: - vp9_highbd_12_get16x16var(src, src_stride, last_src, last_stride, + vpx_highbd_12_get16x16var(src, src_stride, last_src, last_stride, &var16->sse, &var16->sum); break; default: @@ -3688,11 +3646,11 @@ static int set_var_thresh_from_histogram(VP9_COMP *cpi) { return -1; } } else { - vp9_get16x16var(src, src_stride, last_src, last_stride, + vpx_get16x16var(src, src_stride, last_src, last_stride, &var16->sse, &var16->sum); } #else - vp9_get16x16var(src, src_stride, last_src, last_stride, + vpx_get16x16var(src, src_stride, last_src, last_stride, &var16->sse, &var16->sum); #endif // CONFIG_VP9_HIGHBITDEPTH var16->var = var16->sse - @@ -3778,9 +3736,13 @@ void vp9_init_tile_data(VP9_COMP *cpi) { TOKENEXTRA *pre_tok = cpi->tile_tok[0][0]; int tile_tok = 0; - if (cpi->tile_data == NULL) { + if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows) { + if (cpi->tile_data != NULL) + vpx_free(cpi->tile_data); CHECK_MEM_ERROR(cm, cpi->tile_data, vpx_malloc(tile_cols * tile_rows * sizeof(*cpi->tile_data))); + cpi->allocated_tiles = tile_cols * tile_rows; + for (tile_row = 0; tile_row < tile_rows; ++tile_row) for (tile_col = 0; tile_col < tile_cols; ++tile_col) { TileDataEnc *tile_data = @@ -4149,8 +4111,8 @@ static void encode_superblock(VP9_COMP *cpi, ThreadData *td, MODE_INFO **mi_8x8 = xd->mi; MODE_INFO *mi = mi_8x8[0]; MB_MODE_INFO *mbmi = &mi->mbmi; - const int seg_skip = vp9_segfeature_active(&cm->seg, mbmi->segment_id, - SEG_LVL_SKIP); + const int seg_skip = segfeature_active(&cm->seg, mbmi->segment_id, + SEG_LVL_SKIP); const int mis = cm->mi_stride; const int mi_width = num_8x8_blocks_wide_lookup[bsize]; const int mi_height = num_8x8_blocks_high_lookup[bsize]; diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeframe.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeframe.h index 1acde0283e9..6aaa56463b0 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeframe.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeframe.h @@ -40,7 +40,7 @@ void vp9_init_tile_data(struct VP9_COMP *cpi); void 
vp9_encode_tile(struct VP9_COMP *cpi, struct ThreadData *td, int tile_row, int tile_col); -void vp9_set_vbp_thresholds(struct VP9_COMP *cpi, int q); +void vp9_set_variance_partition_thresholds(struct VP9_COMP *cpi, int q); #ifdef __cplusplus } // extern "C" diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemb.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemb.c index 9a4e61ec882..2829365e533 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemb.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemb.c @@ -13,10 +13,12 @@ #include "./vpx_config.h" #include "vpx_mem/vpx_mem.h" +#include "vpx_ports/mem.h" #include "vp9/common/vp9_idct.h" #include "vp9/common/vp9_reconinter.h" #include "vp9/common/vp9_reconintra.h" +#include "vp9/common/vp9_scan.h" #include "vp9/common/vp9_systemdependent.h" #include "vp9/encoder/vp9_encodemb.h" diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemv.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemv.c index af73fcbdcc3..22759983ffa 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemv.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemv.c @@ -22,7 +22,7 @@ static struct vp9_token mv_class_encodings[MV_CLASSES]; static struct vp9_token mv_fp_encodings[MV_FP_SIZE]; static struct vp9_token mv_class0_encodings[CLASS0_SIZE]; -void vp9_entropy_mv_init() { +void vp9_entropy_mv_init(void) { vp9_tokens_from_tree(mv_joint_encodings, vp9_mv_joint_tree); vp9_tokens_from_tree(mv_class_encodings, vp9_mv_class_tree); vp9_tokens_from_tree(mv_class0_encodings, vp9_mv_class0_tree); diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemv.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemv.h index 0ae473749ab..e8ee5ab6641 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemv.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemv.h @@ -18,7 +18,7 @@ extern "C" { #endif -void vp9_entropy_mv_init(); +void vp9_entropy_mv_init(void); void vp9_write_nmv_probs(VP9_COMMON *cm, int usehp, vp9_writer *w, nmv_context_counts *const counts); diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.c index a1018adb88f..d708b83197b 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.c @@ -17,6 +17,7 @@ #include "./vpx_dsp_rtcd.h" #include "./vpx_scale_rtcd.h" #include "vpx/internal/vpx_psnr.h" +#include "vpx_ports/mem.h" #include "vpx_ports/vpx_timer.h" #include "vp9/common/vp9_alloccommon.h" @@ -111,7 +112,7 @@ static INLINE void Scale2Ratio(VPX_SCALING mode, int *hr, int *hs) { // Mark all inactive blocks as active. Other segmentation features may be set // so memset cannot be used, instead only inactive blocks should be reset. 
-void vp9_suppress_active_map(VP9_COMP *cpi) { +static void suppress_active_map(VP9_COMP *cpi) { unsigned char *const seg_map = cpi->segmentation_map; int i; if (cpi->active_map.enabled || cpi->active_map.update) @@ -120,7 +121,7 @@ void vp9_suppress_active_map(VP9_COMP *cpi) { seg_map[i] = AM_SEGMENT_ID_ACTIVE; } -void vp9_apply_active_map(VP9_COMP *cpi) { +static void apply_active_map(VP9_COMP *cpi) { struct segmentation *const seg = &cpi->common.seg; unsigned char *const seg_map = cpi->segmentation_map; const unsigned char *const active_map = cpi->active_map.map; @@ -685,6 +686,29 @@ void vp9_alloc_compressor_data(VP9_COMP *cpi) { vp9_setup_pc_tree(&cpi->common, &cpi->td); } +void vp9_new_framerate(VP9_COMP *cpi, double framerate) { + cpi->framerate = framerate < 0.1 ? 30 : framerate; + vp9_rc_update_framerate(cpi); +} + +static void set_tile_limits(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + + int min_log2_tile_cols, max_log2_tile_cols; + vp9_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols); + + if (is_two_pass_svc(cpi) && + (cpi->svc.encode_empty_frame_state == ENCODING || + cpi->svc.number_spatial_layers > 1)) { + cm->log2_tile_cols = 0; + cm->log2_tile_rows = 0; + } else { + cm->log2_tile_cols = clamp(cpi->oxcf.tile_columns, + min_log2_tile_cols, max_log2_tile_cols); + cm->log2_tile_rows = cpi->oxcf.tile_rows; + } +} + static void update_frame_size(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; @@ -693,6 +717,8 @@ static void update_frame_size(VP9_COMP *cpi) { vp9_init_context_buffers(cm); init_macroblockd(cm, xd); + set_tile_limits(cpi); + if (is_two_pass_svc(cpi)) { if (vp9_realloc_frame_buffer(&cpi->alt_ref_buffer, cm->width, cm->height, @@ -707,27 +733,6 @@ static void update_frame_size(VP9_COMP *cpi) { } } -void vp9_new_framerate(VP9_COMP *cpi, double framerate) { - cpi->framerate = framerate < 0.1 ? 30 : framerate; - vp9_rc_update_framerate(cpi); -} - -static void set_tile_limits(VP9_COMP *cpi) { - VP9_COMMON *const cm = &cpi->common; - - int min_log2_tile_cols, max_log2_tile_cols; - vp9_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols); - - if (is_two_pass_svc(cpi) && cpi->svc.encode_empty_frame_state == ENCODING) { - cm->log2_tile_cols = 0; - cm->log2_tile_rows = 0; - } else { - cm->log2_tile_cols = clamp(cpi->oxcf.tile_columns, - min_log2_tile_cols, max_log2_tile_cols); - cm->log2_tile_rows = cpi->oxcf.tile_rows; - } -} - static void init_buffer_indices(VP9_COMP *cpi) { cpi->lst_fb_idx = 0; cpi->gld_fb_idx = 1; @@ -751,6 +756,8 @@ static void init_config(struct VP9_COMP *cpi, VP9EncoderConfig *oxcf) { cm->height = oxcf->height; vp9_alloc_compressor_data(cpi); + cpi->svc.temporal_layering_mode = oxcf->temporal_layering_mode; + // Single thread case: use counts in common. 
cpi->td.counts = &cm->counts; @@ -995,7 +1002,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_32X16, vpx_highbd_sad32x16_bits8, vpx_highbd_sad32x16_avg_bits8, - vp9_highbd_variance32x16, + vpx_highbd_8_variance32x16, vp9_highbd_sub_pixel_variance32x16, vp9_highbd_sub_pixel_avg_variance32x16, NULL, @@ -1005,7 +1012,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_16X32, vpx_highbd_sad16x32_bits8, vpx_highbd_sad16x32_avg_bits8, - vp9_highbd_variance16x32, + vpx_highbd_8_variance16x32, vp9_highbd_sub_pixel_variance16x32, vp9_highbd_sub_pixel_avg_variance16x32, NULL, @@ -1015,7 +1022,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_64X32, vpx_highbd_sad64x32_bits8, vpx_highbd_sad64x32_avg_bits8, - vp9_highbd_variance64x32, + vpx_highbd_8_variance64x32, vp9_highbd_sub_pixel_variance64x32, vp9_highbd_sub_pixel_avg_variance64x32, NULL, @@ -1025,7 +1032,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_32X64, vpx_highbd_sad32x64_bits8, vpx_highbd_sad32x64_avg_bits8, - vp9_highbd_variance32x64, + vpx_highbd_8_variance32x64, vp9_highbd_sub_pixel_variance32x64, vp9_highbd_sub_pixel_avg_variance32x64, NULL, @@ -1035,7 +1042,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_32X32, vpx_highbd_sad32x32_bits8, vpx_highbd_sad32x32_avg_bits8, - vp9_highbd_variance32x32, + vpx_highbd_8_variance32x32, vp9_highbd_sub_pixel_variance32x32, vp9_highbd_sub_pixel_avg_variance32x32, vpx_highbd_sad32x32x3_bits8, @@ -1045,7 +1052,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_64X64, vpx_highbd_sad64x64_bits8, vpx_highbd_sad64x64_avg_bits8, - vp9_highbd_variance64x64, + vpx_highbd_8_variance64x64, vp9_highbd_sub_pixel_variance64x64, vp9_highbd_sub_pixel_avg_variance64x64, vpx_highbd_sad64x64x3_bits8, @@ -1055,7 +1062,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_16X16, vpx_highbd_sad16x16_bits8, vpx_highbd_sad16x16_avg_bits8, - vp9_highbd_variance16x16, + vpx_highbd_8_variance16x16, vp9_highbd_sub_pixel_variance16x16, vp9_highbd_sub_pixel_avg_variance16x16, vpx_highbd_sad16x16x3_bits8, @@ -1065,7 +1072,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_16X8, vpx_highbd_sad16x8_bits8, vpx_highbd_sad16x8_avg_bits8, - vp9_highbd_variance16x8, + vpx_highbd_8_variance16x8, vp9_highbd_sub_pixel_variance16x8, vp9_highbd_sub_pixel_avg_variance16x8, vpx_highbd_sad16x8x3_bits8, @@ -1075,7 +1082,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_8X16, vpx_highbd_sad8x16_bits8, vpx_highbd_sad8x16_avg_bits8, - vp9_highbd_variance8x16, + vpx_highbd_8_variance8x16, vp9_highbd_sub_pixel_variance8x16, vp9_highbd_sub_pixel_avg_variance8x16, vpx_highbd_sad8x16x3_bits8, @@ -1085,7 +1092,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_8X8, vpx_highbd_sad8x8_bits8, vpx_highbd_sad8x8_avg_bits8, - vp9_highbd_variance8x8, + vpx_highbd_8_variance8x8, vp9_highbd_sub_pixel_variance8x8, vp9_highbd_sub_pixel_avg_variance8x8, vpx_highbd_sad8x8x3_bits8, @@ -1095,7 +1102,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_8X4, vpx_highbd_sad8x4_bits8, vpx_highbd_sad8x4_avg_bits8, - vp9_highbd_variance8x4, + vpx_highbd_8_variance8x4, vp9_highbd_sub_pixel_variance8x4, vp9_highbd_sub_pixel_avg_variance8x4, NULL, @@ -1105,7 +1112,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_4X8, vpx_highbd_sad4x8_bits8, vpx_highbd_sad4x8_avg_bits8, - 
vp9_highbd_variance4x8, + vpx_highbd_8_variance4x8, vp9_highbd_sub_pixel_variance4x8, vp9_highbd_sub_pixel_avg_variance4x8, NULL, @@ -1115,7 +1122,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_4X4, vpx_highbd_sad4x4_bits8, vpx_highbd_sad4x4_avg_bits8, - vp9_highbd_variance4x4, + vpx_highbd_8_variance4x4, vp9_highbd_sub_pixel_variance4x4, vp9_highbd_sub_pixel_avg_variance4x4, vpx_highbd_sad4x4x3_bits8, @@ -1127,7 +1134,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_32X16, vpx_highbd_sad32x16_bits10, vpx_highbd_sad32x16_avg_bits10, - vp9_highbd_10_variance32x16, + vpx_highbd_10_variance32x16, vp9_highbd_10_sub_pixel_variance32x16, vp9_highbd_10_sub_pixel_avg_variance32x16, NULL, @@ -1137,7 +1144,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_16X32, vpx_highbd_sad16x32_bits10, vpx_highbd_sad16x32_avg_bits10, - vp9_highbd_10_variance16x32, + vpx_highbd_10_variance16x32, vp9_highbd_10_sub_pixel_variance16x32, vp9_highbd_10_sub_pixel_avg_variance16x32, NULL, @@ -1147,7 +1154,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_64X32, vpx_highbd_sad64x32_bits10, vpx_highbd_sad64x32_avg_bits10, - vp9_highbd_10_variance64x32, + vpx_highbd_10_variance64x32, vp9_highbd_10_sub_pixel_variance64x32, vp9_highbd_10_sub_pixel_avg_variance64x32, NULL, @@ -1157,7 +1164,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_32X64, vpx_highbd_sad32x64_bits10, vpx_highbd_sad32x64_avg_bits10, - vp9_highbd_10_variance32x64, + vpx_highbd_10_variance32x64, vp9_highbd_10_sub_pixel_variance32x64, vp9_highbd_10_sub_pixel_avg_variance32x64, NULL, @@ -1167,7 +1174,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_32X32, vpx_highbd_sad32x32_bits10, vpx_highbd_sad32x32_avg_bits10, - vp9_highbd_10_variance32x32, + vpx_highbd_10_variance32x32, vp9_highbd_10_sub_pixel_variance32x32, vp9_highbd_10_sub_pixel_avg_variance32x32, vpx_highbd_sad32x32x3_bits10, @@ -1177,7 +1184,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_64X64, vpx_highbd_sad64x64_bits10, vpx_highbd_sad64x64_avg_bits10, - vp9_highbd_10_variance64x64, + vpx_highbd_10_variance64x64, vp9_highbd_10_sub_pixel_variance64x64, vp9_highbd_10_sub_pixel_avg_variance64x64, vpx_highbd_sad64x64x3_bits10, @@ -1187,7 +1194,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_16X16, vpx_highbd_sad16x16_bits10, vpx_highbd_sad16x16_avg_bits10, - vp9_highbd_10_variance16x16, + vpx_highbd_10_variance16x16, vp9_highbd_10_sub_pixel_variance16x16, vp9_highbd_10_sub_pixel_avg_variance16x16, vpx_highbd_sad16x16x3_bits10, @@ -1197,7 +1204,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_16X8, vpx_highbd_sad16x8_bits10, vpx_highbd_sad16x8_avg_bits10, - vp9_highbd_10_variance16x8, + vpx_highbd_10_variance16x8, vp9_highbd_10_sub_pixel_variance16x8, vp9_highbd_10_sub_pixel_avg_variance16x8, vpx_highbd_sad16x8x3_bits10, @@ -1207,7 +1214,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_8X16, vpx_highbd_sad8x16_bits10, vpx_highbd_sad8x16_avg_bits10, - vp9_highbd_10_variance8x16, + vpx_highbd_10_variance8x16, vp9_highbd_10_sub_pixel_variance8x16, vp9_highbd_10_sub_pixel_avg_variance8x16, vpx_highbd_sad8x16x3_bits10, @@ -1217,7 +1224,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_8X8, vpx_highbd_sad8x8_bits10, vpx_highbd_sad8x8_avg_bits10, - vp9_highbd_10_variance8x8, + vpx_highbd_10_variance8x8, 
vp9_highbd_10_sub_pixel_variance8x8, vp9_highbd_10_sub_pixel_avg_variance8x8, vpx_highbd_sad8x8x3_bits10, @@ -1227,7 +1234,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_8X4, vpx_highbd_sad8x4_bits10, vpx_highbd_sad8x4_avg_bits10, - vp9_highbd_10_variance8x4, + vpx_highbd_10_variance8x4, vp9_highbd_10_sub_pixel_variance8x4, vp9_highbd_10_sub_pixel_avg_variance8x4, NULL, @@ -1237,7 +1244,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_4X8, vpx_highbd_sad4x8_bits10, vpx_highbd_sad4x8_avg_bits10, - vp9_highbd_10_variance4x8, + vpx_highbd_10_variance4x8, vp9_highbd_10_sub_pixel_variance4x8, vp9_highbd_10_sub_pixel_avg_variance4x8, NULL, @@ -1247,7 +1254,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_4X4, vpx_highbd_sad4x4_bits10, vpx_highbd_sad4x4_avg_bits10, - vp9_highbd_10_variance4x4, + vpx_highbd_10_variance4x4, vp9_highbd_10_sub_pixel_variance4x4, vp9_highbd_10_sub_pixel_avg_variance4x4, vpx_highbd_sad4x4x3_bits10, @@ -1259,7 +1266,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_32X16, vpx_highbd_sad32x16_bits12, vpx_highbd_sad32x16_avg_bits12, - vp9_highbd_12_variance32x16, + vpx_highbd_12_variance32x16, vp9_highbd_12_sub_pixel_variance32x16, vp9_highbd_12_sub_pixel_avg_variance32x16, NULL, @@ -1269,7 +1276,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_16X32, vpx_highbd_sad16x32_bits12, vpx_highbd_sad16x32_avg_bits12, - vp9_highbd_12_variance16x32, + vpx_highbd_12_variance16x32, vp9_highbd_12_sub_pixel_variance16x32, vp9_highbd_12_sub_pixel_avg_variance16x32, NULL, @@ -1279,7 +1286,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_64X32, vpx_highbd_sad64x32_bits12, vpx_highbd_sad64x32_avg_bits12, - vp9_highbd_12_variance64x32, + vpx_highbd_12_variance64x32, vp9_highbd_12_sub_pixel_variance64x32, vp9_highbd_12_sub_pixel_avg_variance64x32, NULL, @@ -1289,7 +1296,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_32X64, vpx_highbd_sad32x64_bits12, vpx_highbd_sad32x64_avg_bits12, - vp9_highbd_12_variance32x64, + vpx_highbd_12_variance32x64, vp9_highbd_12_sub_pixel_variance32x64, vp9_highbd_12_sub_pixel_avg_variance32x64, NULL, @@ -1299,7 +1306,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_32X32, vpx_highbd_sad32x32_bits12, vpx_highbd_sad32x32_avg_bits12, - vp9_highbd_12_variance32x32, + vpx_highbd_12_variance32x32, vp9_highbd_12_sub_pixel_variance32x32, vp9_highbd_12_sub_pixel_avg_variance32x32, vpx_highbd_sad32x32x3_bits12, @@ -1309,7 +1316,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_64X64, vpx_highbd_sad64x64_bits12, vpx_highbd_sad64x64_avg_bits12, - vp9_highbd_12_variance64x64, + vpx_highbd_12_variance64x64, vp9_highbd_12_sub_pixel_variance64x64, vp9_highbd_12_sub_pixel_avg_variance64x64, vpx_highbd_sad64x64x3_bits12, @@ -1319,7 +1326,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_16X16, vpx_highbd_sad16x16_bits12, vpx_highbd_sad16x16_avg_bits12, - vp9_highbd_12_variance16x16, + vpx_highbd_12_variance16x16, vp9_highbd_12_sub_pixel_variance16x16, vp9_highbd_12_sub_pixel_avg_variance16x16, vpx_highbd_sad16x16x3_bits12, @@ -1329,7 +1336,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_16X8, vpx_highbd_sad16x8_bits12, vpx_highbd_sad16x8_avg_bits12, - vp9_highbd_12_variance16x8, + vpx_highbd_12_variance16x8, vp9_highbd_12_sub_pixel_variance16x8, vp9_highbd_12_sub_pixel_avg_variance16x8, 
vpx_highbd_sad16x8x3_bits12, @@ -1339,7 +1346,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_8X16, vpx_highbd_sad8x16_bits12, vpx_highbd_sad8x16_avg_bits12, - vp9_highbd_12_variance8x16, + vpx_highbd_12_variance8x16, vp9_highbd_12_sub_pixel_variance8x16, vp9_highbd_12_sub_pixel_avg_variance8x16, vpx_highbd_sad8x16x3_bits12, @@ -1349,7 +1356,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_8X8, vpx_highbd_sad8x8_bits12, vpx_highbd_sad8x8_avg_bits12, - vp9_highbd_12_variance8x8, + vpx_highbd_12_variance8x8, vp9_highbd_12_sub_pixel_variance8x8, vp9_highbd_12_sub_pixel_avg_variance8x8, vpx_highbd_sad8x8x3_bits12, @@ -1359,7 +1366,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_8X4, vpx_highbd_sad8x4_bits12, vpx_highbd_sad8x4_avg_bits12, - vp9_highbd_12_variance8x4, + vpx_highbd_12_variance8x4, vp9_highbd_12_sub_pixel_variance8x4, vp9_highbd_12_sub_pixel_avg_variance8x4, NULL, @@ -1369,7 +1376,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_4X8, vpx_highbd_sad4x8_bits12, vpx_highbd_sad4x8_avg_bits12, - vp9_highbd_12_variance4x8, + vpx_highbd_12_variance4x8, vp9_highbd_12_sub_pixel_variance4x8, vp9_highbd_12_sub_pixel_avg_variance4x8, NULL, @@ -1379,7 +1386,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_4X4, vpx_highbd_sad4x4_bits12, vpx_highbd_sad4x4_avg_bits12, - vp9_highbd_12_variance4x4, + vpx_highbd_12_variance4x4, vp9_highbd_12_sub_pixel_variance4x4, vp9_highbd_12_sub_pixel_avg_variance4x4, vpx_highbd_sad4x4x3_bits12, @@ -1589,6 +1596,9 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf, sizeof(*cm->frame_contexts))); cpi->use_svc = 0; + cpi->resize_state = 0; + cpi->resize_avg_qp = 0; + cpi->resize_buffer_underflow = 0; cpi->common.buffer_pool = pool; init_config(cpi, oxcf); @@ -1802,61 +1812,61 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf, cpi->fn_ptr[BT].sdx4df = SDX4DF; BFP(BLOCK_32X16, vpx_sad32x16, vpx_sad32x16_avg, - vp9_variance32x16, vp9_sub_pixel_variance32x16, + vpx_variance32x16, vp9_sub_pixel_variance32x16, vp9_sub_pixel_avg_variance32x16, NULL, NULL, vpx_sad32x16x4d) BFP(BLOCK_16X32, vpx_sad16x32, vpx_sad16x32_avg, - vp9_variance16x32, vp9_sub_pixel_variance16x32, + vpx_variance16x32, vp9_sub_pixel_variance16x32, vp9_sub_pixel_avg_variance16x32, NULL, NULL, vpx_sad16x32x4d) BFP(BLOCK_64X32, vpx_sad64x32, vpx_sad64x32_avg, - vp9_variance64x32, vp9_sub_pixel_variance64x32, + vpx_variance64x32, vp9_sub_pixel_variance64x32, vp9_sub_pixel_avg_variance64x32, NULL, NULL, vpx_sad64x32x4d) BFP(BLOCK_32X64, vpx_sad32x64, vpx_sad32x64_avg, - vp9_variance32x64, vp9_sub_pixel_variance32x64, + vpx_variance32x64, vp9_sub_pixel_variance32x64, vp9_sub_pixel_avg_variance32x64, NULL, NULL, vpx_sad32x64x4d) BFP(BLOCK_32X32, vpx_sad32x32, vpx_sad32x32_avg, - vp9_variance32x32, vp9_sub_pixel_variance32x32, + vpx_variance32x32, vp9_sub_pixel_variance32x32, vp9_sub_pixel_avg_variance32x32, vpx_sad32x32x3, vpx_sad32x32x8, vpx_sad32x32x4d) BFP(BLOCK_64X64, vpx_sad64x64, vpx_sad64x64_avg, - vp9_variance64x64, vp9_sub_pixel_variance64x64, + vpx_variance64x64, vp9_sub_pixel_variance64x64, vp9_sub_pixel_avg_variance64x64, vpx_sad64x64x3, vpx_sad64x64x8, vpx_sad64x64x4d) BFP(BLOCK_16X16, vpx_sad16x16, vpx_sad16x16_avg, - vp9_variance16x16, vp9_sub_pixel_variance16x16, + vpx_variance16x16, vp9_sub_pixel_variance16x16, vp9_sub_pixel_avg_variance16x16, vpx_sad16x16x3, vpx_sad16x16x8, vpx_sad16x16x4d) BFP(BLOCK_16X8, vpx_sad16x8, vpx_sad16x8_avg, - 
vp9_variance16x8, vp9_sub_pixel_variance16x8, + vpx_variance16x8, vp9_sub_pixel_variance16x8, vp9_sub_pixel_avg_variance16x8, vpx_sad16x8x3, vpx_sad16x8x8, vpx_sad16x8x4d) BFP(BLOCK_8X16, vpx_sad8x16, vpx_sad8x16_avg, - vp9_variance8x16, vp9_sub_pixel_variance8x16, + vpx_variance8x16, vp9_sub_pixel_variance8x16, vp9_sub_pixel_avg_variance8x16, vpx_sad8x16x3, vpx_sad8x16x8, vpx_sad8x16x4d) BFP(BLOCK_8X8, vpx_sad8x8, vpx_sad8x8_avg, - vp9_variance8x8, vp9_sub_pixel_variance8x8, + vpx_variance8x8, vp9_sub_pixel_variance8x8, vp9_sub_pixel_avg_variance8x8, vpx_sad8x8x3, vpx_sad8x8x8, vpx_sad8x8x4d) BFP(BLOCK_8X4, vpx_sad8x4, vpx_sad8x4_avg, - vp9_variance8x4, vp9_sub_pixel_variance8x4, + vpx_variance8x4, vp9_sub_pixel_variance8x4, vp9_sub_pixel_avg_variance8x4, NULL, vpx_sad8x4x8, vpx_sad8x4x4d) BFP(BLOCK_4X8, vpx_sad4x8, vpx_sad4x8_avg, - vp9_variance4x8, vp9_sub_pixel_variance4x8, + vpx_variance4x8, vp9_sub_pixel_variance4x8, vp9_sub_pixel_avg_variance4x8, NULL, vpx_sad4x8x8, vpx_sad4x8x4d) BFP(BLOCK_4X4, vpx_sad4x4, vpx_sad4x4_avg, - vp9_variance4x4, vp9_sub_pixel_variance4x4, + vpx_variance4x4, vp9_sub_pixel_variance4x4, vp9_sub_pixel_avg_variance4x4, vpx_sad4x4x3, vpx_sad4x4x8, vpx_sad4x4x4d) @@ -2049,6 +2059,65 @@ void vp9_remove_compressor(VP9_COMP *cpi) { #endif } +/* TODO(yaowu): The block_variance calls the unoptimized versions of variance() + * and highbd_8_variance(). It should not. + */ +static void encoder_variance(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + int w, int h, unsigned int *sse, int *sum) { + int i, j; + + *sum = 0; + *sse = 0; + + for (i = 0; i < h; i++) { + for (j = 0; j < w; j++) { + const int diff = a[j] - b[j]; + *sum += diff; + *sse += diff * diff; + } + + a += a_stride; + b += b_stride; + } +} + +#if CONFIG_VP9_HIGHBITDEPTH +static void encoder_highbd_variance64(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + int w, int h, uint64_t *sse, + uint64_t *sum) { + int i, j; + + uint16_t *a = CONVERT_TO_SHORTPTR(a8); + uint16_t *b = CONVERT_TO_SHORTPTR(b8); + *sum = 0; + *sse = 0; + + for (i = 0; i < h; i++) { + for (j = 0; j < w; j++) { + const int diff = a[j] - b[j]; + *sum += diff; + *sse += diff * diff; + } + a += a_stride; + b += b_stride; + } +} + +static void encoder_highbd_8_variance(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + int w, int h, + unsigned int *sse, int *sum) { + uint64_t sse_long = 0; + uint64_t sum_long = 0; + encoder_highbd_variance64(a8, a_stride, b8, b_stride, w, h, + &sse_long, &sum_long); + *sse = (unsigned int)sse_long; + *sum = (int)sum_long; +} +#endif // CONFIG_VP9_HIGHBITDEPTH + static int64_t get_sse(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height) { @@ -2060,15 +2129,15 @@ static int64_t get_sse(const uint8_t *a, int a_stride, int x, y; if (dw > 0) { - variance(&a[width - dw], a_stride, &b[width - dw], b_stride, - dw, height, &sse, &sum); + encoder_variance(&a[width - dw], a_stride, &b[width - dw], b_stride, + dw, height, &sse, &sum); total_sse += sse; } if (dh > 0) { - variance(&a[(height - dh) * a_stride], a_stride, - &b[(height - dh) * b_stride], b_stride, - width - dw, dh, &sse, &sum); + encoder_variance(&a[(height - dh) * a_stride], a_stride, + &b[(height - dh) * b_stride], b_stride, + width - dw, dh, &sse, &sum); total_sse += sse; } @@ -2076,7 +2145,7 @@ static int64_t get_sse(const uint8_t *a, int a_stride, const uint8_t *pa = a; const uint8_t *pb = b; for (x = 0; x < width / 16; ++x) { - vp9_mse16x16(pa, a_stride, pb, 
b_stride, &sse); + vpx_mse16x16(pa, a_stride, pb, b_stride, &sse); total_sse += sse; pa += 16; @@ -2121,21 +2190,22 @@ static int64_t highbd_get_sse(const uint8_t *a, int a_stride, unsigned int sse = 0; int sum = 0; if (dw > 0) { - highbd_variance(&a[width - dw], a_stride, &b[width - dw], b_stride, - dw, height, &sse, &sum); + encoder_highbd_8_variance(&a[width - dw], a_stride, + &b[width - dw], b_stride, + dw, height, &sse, &sum); total_sse += sse; } if (dh > 0) { - highbd_variance(&a[(height - dh) * a_stride], a_stride, - &b[(height - dh) * b_stride], b_stride, - width - dw, dh, &sse, &sum); + encoder_highbd_8_variance(&a[(height - dh) * a_stride], a_stride, + &b[(height - dh) * b_stride], b_stride, + width - dw, dh, &sse, &sum); total_sse += sse; } for (y = 0; y < height / 16; ++y) { const uint8_t *pa = a; const uint8_t *pb = b; for (x = 0; x < width / 16; ++x) { - vp9_highbd_mse16x16(pa, a_stride, pb, b_stride, &sse); + vpx_highbd_8_mse16x16(pa, a_stride, pb, b_stride, &sse); total_sse += sse; pa += 16; pb += 16; @@ -2260,8 +2330,9 @@ static void generate_psnr_packet(VP9_COMP *cpi) { pkt.data.psnr.psnr[i] = psnr.psnr[i]; } pkt.kind = VPX_CODEC_PSNR_PKT; - if (is_two_pass_svc(cpi)) - cpi->svc.layer_context[cpi->svc.spatial_layer_id].psnr_pkt = pkt.data.psnr; + if (cpi->use_svc) + cpi->svc.layer_context[cpi->svc.spatial_layer_id * + cpi->svc.number_temporal_layers].psnr_pkt = pkt.data.psnr; else vpx_codec_pkt_list_add(cpi->output_pkt_list, &pkt); } @@ -2711,7 +2782,10 @@ void vp9_scale_references(VP9_COMP *cpi) { #if CONFIG_VP9_HIGHBITDEPTH if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) { const int new_fb = get_free_fb(cm); - RefCntBuffer *const new_fb_ptr = &pool->frame_bufs[new_fb]; + RefCntBuffer *new_fb_ptr = NULL; + if (cm->new_fb_idx == INVALID_IDX) + return; + new_fb_ptr = &pool->frame_bufs[new_fb]; cm->cur_frame = &pool->frame_bufs[new_fb]; vp9_realloc_frame_buffer(&pool->frame_bufs[new_fb].buf, cm->width, cm->height, @@ -2723,7 +2797,10 @@ void vp9_scale_references(VP9_COMP *cpi) { #else if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) { const int new_fb = get_free_fb(cm); - RefCntBuffer *const new_fb_ptr = &pool->frame_bufs[new_fb]; + RefCntBuffer *new_fb_ptr = NULL; + if (cm->new_fb_idx == INVALID_IDX) + return; + new_fb_ptr = &pool->frame_bufs[new_fb]; vp9_realloc_frame_buffer(&new_fb_ptr->buf, cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, @@ -2792,19 +2869,25 @@ static void output_frame_level_debug_stats(VP9_COMP *cpi) { recon_err = vp9_get_y_sse(cpi->Source, get_frame_new_buffer(cm)); if (cpi->twopass.total_left_stats.coded_error != 0.0) - fprintf(f, "%10u %dx%d %10d %10d %10d %10d" - "%10"PRId64" %10"PRId64" %10"PRId64" %10"PRId64" %10d " - "%7.2lf %7.2lf %7.2lf %7.2lf %7.2lf" + fprintf(f, "%10u %dx%d %d %d %10d %10d %10d %10d" + "%10"PRId64" %10"PRId64" %5d %5d %10"PRId64" " + "%10"PRId64" %10"PRId64" %10d " + "%7.2lf %7.2lf %7.2lf %7.2lf %7.2lf" "%6d %6d %5d %5d %5d " "%10"PRId64" %10.3lf" - "%10lf %8u %10"PRId64" %10d %10d\n", + "%10lf %8u %10"PRId64" %10d %10d %10d\n", cpi->common.current_video_frame, cm->width, cm->height, + cpi->rc.source_alt_ref_pending, + cpi->rc.source_alt_ref_active, cpi->rc.this_frame_target, cpi->rc.projected_frame_size, cpi->rc.projected_frame_size / cpi->common.MBs, (cpi->rc.projected_frame_size - cpi->rc.this_frame_target), cpi->rc.vbr_bits_off_target, + cpi->rc.vbr_bits_off_target_fast, + cpi->twopass.extend_minq, + cpi->twopass.extend_minq_fast, cpi->rc.total_target_vs_actual, 
(cpi->rc.starting_buffer_level - cpi->rc.bits_off_target), cpi->rc.total_actual_bits, cm->base_qindex, @@ -2821,7 +2904,8 @@ static void output_frame_level_debug_stats(VP9_COMP *cpi) { cpi->twopass.bits_left / (1 + cpi->twopass.total_left_stats.coded_error), cpi->tot_recode_hits, recon_err, cpi->rc.kf_boost, - cpi->twopass.kf_zeromotion_pct); + cpi->twopass.kf_zeromotion_pct, + cpi->twopass.fr_content_type); fclose(f); @@ -2933,7 +3017,7 @@ static void init_motion_estimation(VP9_COMP *cpi) { } } -void set_frame_size(VP9_COMP *cpi) { +static void set_frame_size(VP9_COMP *cpi) { int ref_frame; VP9_COMMON *const cm = &cpi->common; VP9EncoderConfig *const oxcf = &cpi->oxcf; @@ -2951,6 +3035,31 @@ void set_frame_size(VP9_COMP *cpi) { oxcf->scaled_frame_height); } + if (oxcf->pass == 0 && + oxcf->rc_mode == VPX_CBR && + !cpi->use_svc && + oxcf->resize_mode == RESIZE_DYNAMIC) { + if (cpi->resize_state == 1) { + oxcf->scaled_frame_width = + (cm->width * cpi->resize_scale_num) / cpi->resize_scale_den; + oxcf->scaled_frame_height = + (cm->height * cpi->resize_scale_num) /cpi->resize_scale_den; + } else if (cpi->resize_state == -1) { + // Go back up to original size. + oxcf->scaled_frame_width = oxcf->width; + oxcf->scaled_frame_height = oxcf->height; + } + if (cpi->resize_state != 0) { + // There has been a change in frame size. + vp9_set_size_literal(cpi, + oxcf->scaled_frame_width, + oxcf->scaled_frame_height); + + // TODO(agrange) Scale cpi->max_mv_magnitude if frame-size has changed. + set_mv_search_params(cpi); + } + } + if ((oxcf->pass == 2) && (!cpi->use_svc || (is_two_pass_svc(cpi) && @@ -3026,11 +3135,11 @@ static void encode_without_recode_loop(VP9_COMP *cpi) { set_size_dependent_vars(cpi, &q, &bottom_index, &top_index); vp9_set_quantizer(cm, q); - vp9_set_vbp_thresholds(cpi, q); + vp9_set_variance_partition_thresholds(cpi, q); setup_frame(cpi); - vp9_suppress_active_map(cpi); + suppress_active_map(cpi); // Variance adaptive and in frame q adjustment experiments are mutually // exclusive. if (cpi->oxcf.aq_mode == VARIANCE_AQ) { @@ -3040,7 +3149,7 @@ static void encode_without_recode_loop(VP9_COMP *cpi) { } else if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) { vp9_cyclic_refresh_setup(cpi); } - vp9_apply_active_map(cpi); + apply_active_map(cpi); // transform / motion compensation build reconstruction frame vp9_encode_frame(cpi); @@ -3394,7 +3503,7 @@ static void set_arf_sign_bias(VP9_COMP *cpi) { cm->ref_frame_sign_bias[ALTREF_FRAME] = arf_sign_bias; } -int setup_interp_filter_search_mask(VP9_COMP *cpi) { +static int setup_interp_filter_search_mask(VP9_COMP *cpi) { INTERP_FILTER ifilter; int ref_total[MAX_REF_FRAMES] = {0}; MV_REFERENCE_FRAME ref; @@ -3471,34 +3580,41 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, } } if (is_two_pass_svc(cpi) && cm->error_resilient_mode == 0) { - // Use the last frame context for the empty frame. + // Use context 0 for intra only empty frame, but the last frame context + // for other empty frames. + if (cpi->svc.encode_empty_frame_state == ENCODING) { + if (cpi->svc.encode_intra_empty_frame != 0) + cm->frame_context_idx = 0; + else + cm->frame_context_idx = FRAME_CONTEXTS - 1; + } else { cm->frame_context_idx = - (cpi->svc.encode_empty_frame_state == ENCODING) ? 
FRAME_CONTEXTS - 1 : cpi->svc.spatial_layer_id * cpi->svc.number_temporal_layers + cpi->svc.temporal_layer_id; + } + + cm->frame_parallel_decoding_mode = oxcf->frame_parallel_decoding_mode; // The probs will be updated based on the frame type of its previous // frame if frame_parallel_decoding_mode is 0. The type may vary for // the frame after a key frame in base layer since we may drop enhancement // layers. So set frame_parallel_decoding_mode to 1 in this case. - if (cpi->svc.number_temporal_layers == 1) { - if (cpi->svc.spatial_layer_id == 0 && - cpi->svc.layer_context[0].last_frame_type == KEY_FRAME) - cm->frame_parallel_decoding_mode = 1; - else - cm->frame_parallel_decoding_mode = 0; - } else if (cpi->svc.spatial_layer_id == 0) { - // Find the 2nd frame in temporal base layer and 1st frame in temporal - // enhancement layers from the key frame. - int i; - for (i = 0; i < cpi->svc.number_temporal_layers; ++i) { - if (cpi->svc.layer_context[0].frames_from_key_frame == 1 << i) { + if (cm->frame_parallel_decoding_mode == 0) { + if (cpi->svc.number_temporal_layers == 1) { + if (cpi->svc.spatial_layer_id == 0 && + cpi->svc.layer_context[0].last_frame_type == KEY_FRAME) cm->frame_parallel_decoding_mode = 1; - break; + } else if (cpi->svc.spatial_layer_id == 0) { + // Find the 2nd frame in temporal base layer and 1st frame in temporal + // enhancement layers from the key frame. + int i; + for (i = 0; i < cpi->svc.number_temporal_layers; ++i) { + if (cpi->svc.layer_context[0].frames_from_key_frame == 1 << i) { + cm->frame_parallel_decoding_mode = 1; + break; + } } } - if (i == cpi->svc.number_temporal_layers) - cm->frame_parallel_decoding_mode = 0; } } @@ -3643,9 +3759,11 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, } cm->prev_frame = cm->cur_frame; - if (is_two_pass_svc(cpi)) - cpi->svc.layer_context[cpi->svc.spatial_layer_id].last_frame_type = - cm->frame_type; + if (cpi->use_svc) + cpi->svc.layer_context[cpi->svc.spatial_layer_id * + cpi->svc.number_temporal_layers + + cpi->svc.temporal_layer_id].last_frame_type = + cm->frame_type; } static void SvcEncode(VP9_COMP *cpi, size_t *size, uint8_t *dest, @@ -3788,8 +3906,8 @@ static int frame_is_reference(const VP9_COMP *cpi) { cm->seg.update_data; } -void adjust_frame_rate(VP9_COMP *cpi, - const struct lookahead_entry *source) { +static void adjust_frame_rate(VP9_COMP *cpi, + const struct lookahead_entry *source) { int64_t this_duration; int step = 0; @@ -3872,15 +3990,16 @@ static void check_src_altref(VP9_COMP *cpi, extern double vp9_get_blockiness(const unsigned char *img1, int img1_pitch, const unsigned char *img2, int img2_pitch, int width, int height); -#endif -void adjust_image_stat(double y, double u, double v, double all, ImageStat *s) { +static void adjust_image_stat(double y, double u, double v, double all, + ImageStat *s) { s->stat[Y] += y; s->stat[U] += u; s->stat[V] += v; s->stat[ALL] += all; s->worst = MIN(s->worst, all); } +#endif // CONFIG_INTERNAL_STATS int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, size_t *size, uint8_t *dest, @@ -3905,6 +4024,8 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, #endif if (oxcf->pass == 2) vp9_restore_layer_context(cpi); + } else if (is_one_pass_cbr_svc(cpi)) { + vp9_one_pass_cbr_svc_start_layer(cpi); } vpx_usec_timer_start(&cmptimer); @@ -3923,9 +4044,11 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, // Normal defaults cm->reset_frame_context = 0; cm->refresh_frame_context = 1; - cpi->refresh_last_frame = 1; - 
cpi->refresh_golden_frame = 0; - cpi->refresh_alt_ref_frame = 0; + if (!is_one_pass_cbr_svc(cpi)) { + cpi->refresh_last_frame = 1; + cpi->refresh_golden_frame = 0; + cpi->refresh_alt_ref_frame = 0; + } // Should we encode an arf frame. arf_src_index = get_arf_src_index(cpi); @@ -3962,6 +4085,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, } cm->show_frame = 0; + cm->intra_only = 0; cpi->refresh_alt_ref_frame = 1; cpi->refresh_golden_frame = 0; cpi->refresh_last_frame = 0; @@ -3980,12 +4104,11 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, } // Read in the source frame. -#if CONFIG_SPATIAL_SVC - if (is_two_pass_svc(cpi)) + if (cpi->use_svc) source = vp9_svc_lookahead_pop(cpi, cpi->lookahead, flush); else -#endif source = vp9_lookahead_pop(cpi->lookahead, flush); + if (source != NULL) { cm->show_frame = 1; cm->intra_only = 0; @@ -4034,8 +4157,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, adjust_frame_rate(cpi, source); } - if (cpi->svc.number_temporal_layers > 1 && - oxcf->rc_mode == VPX_CBR) { + if (is_one_pass_cbr_svc(cpi)) { vp9_update_temporal_layer_framerate(cpi); vp9_restore_layer_context(cpi); } @@ -4071,7 +4193,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, (is_two_pass_svc(cpi) && cpi->svc.encode_empty_frame_state != ENCODING))) { vp9_rc_get_second_pass_params(cpi); - } else { + } else if (oxcf->pass == 1) { set_frame_size(cpi); } @@ -4117,11 +4239,10 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, } // Save layer specific state. - if ((cpi->svc.number_temporal_layers > 1 && - oxcf->rc_mode == VPX_CBR) || - ((cpi->svc.number_temporal_layers > 1 || - cpi->svc.number_spatial_layers > 1) && - oxcf->pass == 2)) { + if (is_one_pass_cbr_svc(cpi) || + ((cpi->svc.number_temporal_layers > 1 || + cpi->svc.number_spatial_layers > 1) && + oxcf->pass == 2)) { vp9_save_layer_context(cpi); } @@ -4180,7 +4301,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, vp9_clear_system_state(); #if CONFIG_VP9_HIGHBITDEPTH - calc_highbd_psnr(orig, pp, &psnr, cpi->td.mb.e_mbd.bd, + calc_highbd_psnr(orig, pp, &psnr2, cpi->td.mb.e_mbd.bd, cpi->oxcf.input_bit_depth); #else calc_psnr(orig, pp, &psnr2); @@ -4231,31 +4352,38 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, } } if (cpi->b_calculate_blockiness) { - double frame_blockiness = vp9_get_blockiness( - cpi->Source->y_buffer, cpi->Source->y_stride, - cm->frame_to_show->y_buffer, cm->frame_to_show->y_stride, - cpi->Source->y_width, cpi->Source->y_height); - cpi->worst_blockiness = MAX(cpi->worst_blockiness, frame_blockiness); - cpi->total_blockiness += frame_blockiness; +#if CONFIG_VP9_HIGHBITDEPTH + if (!cm->use_highbitdepth) +#endif + { + double frame_blockiness = vp9_get_blockiness( + cpi->Source->y_buffer, cpi->Source->y_stride, + cm->frame_to_show->y_buffer, cm->frame_to_show->y_stride, + cpi->Source->y_width, cpi->Source->y_height); + cpi->worst_blockiness = MAX(cpi->worst_blockiness, frame_blockiness); + cpi->total_blockiness += frame_blockiness; + } } if (cpi->b_calculate_consistency) { - double this_inconsistency = vp9_get_ssim_metrics( - cpi->Source->y_buffer, cpi->Source->y_stride, - cm->frame_to_show->y_buffer, cm->frame_to_show->y_stride, - cpi->Source->y_width, cpi->Source->y_height, cpi->ssim_vars, - &cpi->metrics, 1); - - const double peak = (double)((1 << cpi->oxcf.input_bit_depth) - 1); - - - double consistency = vpx_sse_to_psnr(samples, peak, +#if 
CONFIG_VP9_HIGHBITDEPTH + if (!cm->use_highbitdepth) +#endif + { + double this_inconsistency = vp9_get_ssim_metrics( + cpi->Source->y_buffer, cpi->Source->y_stride, + cm->frame_to_show->y_buffer, cm->frame_to_show->y_stride, + cpi->Source->y_width, cpi->Source->y_height, cpi->ssim_vars, + &cpi->metrics, 1); + + const double peak = (double)((1 << cpi->oxcf.input_bit_depth) - 1); + double consistency = vpx_sse_to_psnr(samples, peak, (double)cpi->total_inconsistency); - - if (consistency > 0.0) - cpi->worst_consistency = MIN(cpi->worst_consistency, - consistency); - cpi->total_inconsistency += this_inconsistency; + if (consistency > 0.0) + cpi->worst_consistency = MIN(cpi->worst_consistency, + consistency); + cpi->total_inconsistency += this_inconsistency; + } } if (cpi->b_calculate_ssimg) { @@ -4273,6 +4401,9 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, #endif // CONFIG_VP9_HIGHBITDEPTH adjust_image_stat(y, u, v, frame_all, &cpi->ssimg); } +#if CONFIG_VP9_HIGHBITDEPTH + if (!cm->use_highbitdepth) +#endif { double y, u, v, frame_all; frame_all = vp9_calc_fastssim(cpi->Source, cm->frame_to_show, &y, &u, @@ -4280,6 +4411,9 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, adjust_image_stat(y, u, v, frame_all, &cpi->fastssim); /* TODO(JBB): add 10/12 bit support */ } +#if CONFIG_VP9_HIGHBITDEPTH + if (!cm->use_highbitdepth) +#endif { double y, u, v, frame_all; frame_all = vp9_psnrhvs(cpi->Source, cm->frame_to_show, &y, &u, &v); @@ -4291,8 +4425,10 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, #endif if (is_two_pass_svc(cpi)) { - if (cpi->svc.encode_empty_frame_state == ENCODING) + if (cpi->svc.encode_empty_frame_state == ENCODING) { cpi->svc.encode_empty_frame_state = ENCODED; + cpi->svc.encode_intra_empty_frame = 0; + } if (cm->show_frame) { ++cpi->svc.spatial_layer_to_encode; @@ -4302,6 +4438,12 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, // May need the empty frame after an visible frame. cpi->svc.encode_empty_frame_state = NEED_TO_ENCODE; } + } else if (is_one_pass_cbr_svc(cpi)) { + if (cm->show_frame) { + ++cpi->svc.spatial_layer_to_encode; + if (cpi->svc.spatial_layer_to_encode >= cpi->svc.number_spatial_layers) + cpi->svc.spatial_layer_to_encode = 0; + } } return 0; } diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.h index 41f1c13d493..2b0da103ffe 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.h @@ -194,10 +194,10 @@ typedef struct VP9EncoderConfig { int ss_number_layers; // Number of spatial layers. int ts_number_layers; // Number of temporal layers. // Bitrate allocation for spatial layers. + int layer_target_bitrate[VPX_MAX_LAYERS]; int ss_target_bitrate[VPX_SS_MAX_LAYERS]; int ss_enable_auto_arf[VPX_SS_MAX_LAYERS]; // Bitrate allocation (CBR mode) and framerate factor, for temporal layers. 
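/*
 * Illustrative sketch (editorial, not part of this patch): the
 * is_one_pass_cbr_svc() branch added to vp9_get_compressed_data() above
 * advances svc.spatial_layer_to_encode once per shown frame and wraps it back
 * to 0, so spatial layers are visited round-robin. A minimal standalone model
 * of just that counter update, with hypothetical parameter names:
 */
int next_spatial_layer_to_encode(int current_layer, int num_spatial_layers,
                                 int frame_is_shown) {
  if (!frame_is_shown)
    return current_layer;            /* hidden (e.g. alt-ref) frames: no change */
  ++current_layer;                   /* move on to the next spatial layer       */
  if (current_layer >= num_spatial_layers)
    current_layer = 0;               /* wrap after the top spatial layer        */
  return current_layer;
}
/* With 3 spatial layers the shown frames therefore cycle 0, 1, 2, 0, 1, 2, ... */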
- int ts_target_bitrate[VPX_TS_MAX_LAYERS]; int ts_rate_decimator[VPX_TS_MAX_LAYERS]; int enable_auto_arf; @@ -237,6 +237,7 @@ typedef struct VP9EncoderConfig { int use_highbitdepth; #endif vpx_color_space_t color_space; + VP9E_TEMPORAL_LAYERING_MODE temporal_layering_mode; } VP9EncoderConfig; static INLINE int is_lossless_requested(const VP9EncoderConfig *cfg) { @@ -305,6 +306,7 @@ typedef struct VP9_COMP { YV12_BUFFER_CONFIG scaled_last_source; TileDataEnc *tile_data; + int allocated_tiles; // Keep track of memory allocated for tiles. // For a still frame, this flag is set to 1 to skip partition search. int partition_search_skippable_frame; @@ -477,6 +479,12 @@ typedef struct VP9_COMP { #endif int resize_pending; + int resize_state; + int resize_scale_num; + int resize_scale_den; + int resize_avg_qp; + int resize_buffer_underflow; + int resize_count; // VAR_BASED_PARTITION thresholds // 0 - threshold_64x64; 1 - threshold_32x32; @@ -611,9 +619,11 @@ YV12_BUFFER_CONFIG *vp9_scale_if_required(VP9_COMMON *cm, void vp9_apply_encoding_flags(VP9_COMP *cpi, vpx_enc_frame_flags_t flags); static INLINE int is_two_pass_svc(const struct VP9_COMP *const cpi) { - return cpi->use_svc && - ((cpi->svc.number_spatial_layers > 1) || - (cpi->svc.number_temporal_layers > 1 && cpi->oxcf.pass != 0)); + return cpi->use_svc && cpi->oxcf.pass != 0; +} + +static INLINE int is_one_pass_cbr_svc(const struct VP9_COMP *const cpi) { + return (cpi->use_svc && cpi->oxcf.pass == 0); } static INLINE int is_altref_enabled(const VP9_COMP *const cpi) { @@ -642,6 +652,8 @@ static INLINE int *cond_cost_list(const struct VP9_COMP *cpi, int *cost_list) { void vp9_new_framerate(VP9_COMP *cpi, double framerate); +#define LAYER_IDS_TO_IDX(sl, tl, num_tl) ((sl) * (num_tl) + (tl)) + #ifdef __cplusplus } // extern "C" #endif diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ethread.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ethread.c index 8700ccdaecd..4ae3fbc5409 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ethread.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ethread.c @@ -54,6 +54,18 @@ static int enc_worker_hook(EncWorkerData *const thread_data, void *unused) { return 0; } +static int get_max_tile_cols(VP9_COMP *cpi) { + const int aligned_width = ALIGN_POWER_OF_TWO(cpi->oxcf.width, MI_SIZE_LOG2); + int mi_cols = aligned_width >> MI_SIZE_LOG2; + int min_log2_tile_cols, max_log2_tile_cols; + int log2_tile_cols; + + vp9_get_tile_n_bits(mi_cols, &min_log2_tile_cols, &max_log2_tile_cols); + log2_tile_cols = clamp(cpi->oxcf.tile_columns, + min_log2_tile_cols, max_log2_tile_cols); + return (1 << log2_tile_cols); +} + void vp9_encode_tiles_mt(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; const int tile_cols = 1 << cm->log2_tile_cols; @@ -65,20 +77,30 @@ void vp9_encode_tiles_mt(VP9_COMP *cpi) { // Only run once to create threads and allocate thread data. if (cpi->num_workers == 0) { + int allocated_workers = num_workers; + + // While using SVC, we need to allocate threads according to the highest + // resolution. 
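/*
 * Illustrative sketch (editorial, not part of this patch): LAYER_IDS_TO_IDX,
 * introduced in vp9_encoder.h above and used throughout this change to index
 * svc.layer_context[], lays the contexts out spatial-major with
 * number_temporal_layers entries per spatial layer. A standalone check of the
 * index math for a hypothetical 2-spatial x 3-temporal configuration:
 */
#include <assert.h>

#define LAYER_IDS_TO_IDX(sl, tl, num_tl) ((sl) * (num_tl) + (tl))  /* as above */

int main(void) {
  const int num_tl = 3;                         /* temporal layers per spatial layer */
  assert(LAYER_IDS_TO_IDX(0, 0, num_tl) == 0);  /* SL0/TL0 -> first context          */
  assert(LAYER_IDS_TO_IDX(0, 2, num_tl) == 2);  /* SL0/TL2                           */
  assert(LAYER_IDS_TO_IDX(1, 0, num_tl) == 3);  /* SL1/TL0 starts the next block     */
  assert(LAYER_IDS_TO_IDX(1, 2, num_tl) == 5);  /* SL1/TL2 -> last of 2*3 contexts   */
  return 0;
}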
+ if (cpi->use_svc) { + int max_tile_cols = get_max_tile_cols(cpi); + allocated_workers = MIN(cpi->oxcf.max_threads, max_tile_cols); + } + CHECK_MEM_ERROR(cm, cpi->workers, - vpx_malloc(num_workers * sizeof(*cpi->workers))); + vpx_malloc(allocated_workers * sizeof(*cpi->workers))); CHECK_MEM_ERROR(cm, cpi->tile_thr_data, - vpx_calloc(num_workers, sizeof(*cpi->tile_thr_data))); + vpx_calloc(allocated_workers, + sizeof(*cpi->tile_thr_data))); - for (i = 0; i < num_workers; i++) { + for (i = 0; i < allocated_workers; i++) { VP9Worker *const worker = &cpi->workers[i]; EncWorkerData *thread_data = &cpi->tile_thr_data[i]; ++cpi->num_workers; winterface->init(worker); - if (i < num_workers - 1) { + if (i < allocated_workers - 1) { thread_data->cpi = cpi; // Allocate thread data. @@ -154,7 +176,7 @@ void vp9_encode_tiles_mt(VP9_COMP *cpi) { // Set the starting tile for each thread. thread_data->start = i; - if (i == num_workers - 1) + if (i == cpi->num_workers - 1) winterface->execute(worker); else winterface->launch(worker); @@ -171,7 +193,7 @@ void vp9_encode_tiles_mt(VP9_COMP *cpi) { EncWorkerData *const thread_data = (EncWorkerData*)worker->data1; // Accumulate counters. - if (i < num_workers - 1) { + if (i < cpi->num_workers - 1) { vp9_accumulate_frame_counts(cm, thread_data->td->counts, 0); accumulate_rd_opt(&cpi->td, thread_data->td); } diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_extend.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_extend.c index 96f3598b1dc..6e1ed365dab 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_extend.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_extend.c @@ -9,6 +9,7 @@ */ #include "vpx_mem/vpx_mem.h" +#include "vpx_ports/mem.h" #include "vp9/common/vp9_common.h" #include "vp9/encoder/vp9_extend.h" diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.c index 9752668b15d..3d7843ea731 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.c @@ -12,9 +12,11 @@ #include <math.h> #include <stdio.h> +#include "./vpx_dsp_rtcd.h" #include "./vpx_scale_rtcd.h" #include "vpx_mem/vpx_mem.h" +#include "vpx_ports/mem.h" #include "vpx_scale/vpx_scale.h" #include "vpx_scale/yv12config.h" @@ -111,8 +113,8 @@ static void output_stats(FIRSTPASS_STATS *stats, fpfile = fopen("firstpass.stt", "a"); fprintf(fpfile, "%12.0lf %12.4lf %12.0lf %12.0lf %12.0lf %12.4lf %12.4lf" - "%12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf" - "%12.4lf %12.0lf %12.0lf %12.0lf %12.4lf\n", + "%12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf" + "%12.4lf %12.4lf %12.0lf %12.0lf %12.0lf %12.4lf\n", stats->frame, stats->weight, stats->intra_error, @@ -122,6 +124,8 @@ static void output_stats(FIRSTPASS_STATS *stats, stats->pcnt_motion, stats->pcnt_second_ref, stats->pcnt_neutral, + stats->intra_skip_pct, + stats->inactive_zone_rows, stats->MVr, stats->mvr_abs, stats->MVc, @@ -158,7 +162,9 @@ static void zero_stats(FIRSTPASS_STATS *section) { section->pcnt_motion = 0.0; section->pcnt_second_ref = 0.0; section->pcnt_neutral = 0.0; - section->MVr = 0.0; + section->intra_skip_pct = 0.0; + section->inactive_zone_rows = 0.0; + section->MVr = 0.0; section->mvr_abs = 0.0; section->MVc = 0.0; section->mvc_abs = 0.0; @@ -183,7 +189,9 @@ static void accumulate_stats(FIRSTPASS_STATS *section, section->pcnt_motion += 
frame->pcnt_motion; section->pcnt_second_ref += frame->pcnt_second_ref; section->pcnt_neutral += frame->pcnt_neutral; - section->MVr += frame->MVr; + section->intra_skip_pct += frame->intra_skip_pct; + section->inactive_zone_rows += frame->inactive_zone_rows; + section->MVr += frame->MVr; section->mvr_abs += frame->mvr_abs; section->MVc += frame->MVc; section->mvc_abs += frame->mvc_abs; @@ -206,7 +214,9 @@ static void subtract_stats(FIRSTPASS_STATS *section, section->pcnt_motion -= frame->pcnt_motion; section->pcnt_second_ref -= frame->pcnt_second_ref; section->pcnt_neutral -= frame->pcnt_neutral; - section->MVr -= frame->MVr; + section->intra_skip_pct -= frame->intra_skip_pct; + section->inactive_zone_rows -= frame->inactive_zone_rows; + section->MVr -= frame->MVr; section->mvr_abs -= frame->mvr_abs; section->MVc -= frame->MVc; section->mvc_abs -= frame->mvc_abs; @@ -266,13 +276,13 @@ void vp9_end_first_pass(VP9_COMP *cpi) { static vp9_variance_fn_t get_block_variance_fn(BLOCK_SIZE bsize) { switch (bsize) { case BLOCK_8X8: - return vp9_mse8x8; + return vpx_mse8x8; case BLOCK_16X8: - return vp9_mse16x8; + return vpx_mse16x8; case BLOCK_8X16: - return vp9_mse8x16; + return vpx_mse8x16; default: - return vp9_mse16x16; + return vpx_mse16x16; } } @@ -292,37 +302,37 @@ static vp9_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize, default: switch (bsize) { case BLOCK_8X8: - return vp9_highbd_mse8x8; + return vpx_highbd_8_mse8x8; case BLOCK_16X8: - return vp9_highbd_mse16x8; + return vpx_highbd_8_mse16x8; case BLOCK_8X16: - return vp9_highbd_mse8x16; + return vpx_highbd_8_mse8x16; default: - return vp9_highbd_mse16x16; + return vpx_highbd_8_mse16x16; } break; case 10: switch (bsize) { case BLOCK_8X8: - return vp9_highbd_10_mse8x8; + return vpx_highbd_10_mse8x8; case BLOCK_16X8: - return vp9_highbd_10_mse16x8; + return vpx_highbd_10_mse16x8; case BLOCK_8X16: - return vp9_highbd_10_mse8x16; + return vpx_highbd_10_mse8x16; default: - return vp9_highbd_10_mse16x16; + return vpx_highbd_10_mse16x16; } break; case 12: switch (bsize) { case BLOCK_8X8: - return vp9_highbd_12_mse8x8; + return vpx_highbd_12_mse8x8; case BLOCK_16X8: - return vp9_highbd_12_mse16x8; + return vpx_highbd_12_mse16x8; case BLOCK_8X16: - return vp9_highbd_12_mse8x16; + return vpx_highbd_12_mse8x16; default: - return vp9_highbd_12_mse16x16; + return vpx_highbd_12_mse16x16; } break; } @@ -451,6 +461,8 @@ static void set_first_pass_params(VP9_COMP *cpi) { cpi->rc.frames_to_key = INT_MAX; } +#define UL_INTRA_THRESH 50 +#define INVALID_ROW -1 void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { int mb_row, mb_col; MACROBLOCK *const x = &cpi->td.mb; @@ -475,6 +487,8 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { int second_ref_count = 0; const int intrapenalty = INTRA_MODE_PENALTY; double neutral_count; + int intra_skip_count = 0; + int image_data_start_row = INVALID_ROW; int new_mv_count = 0; int sum_in_vectors = 0; MV lastmv = {0, 0}; @@ -633,7 +647,19 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { xd->mi[0]->mbmi.tx_size = use_dc_pred ? (bsize >= BLOCK_16X16 ? TX_16X16 : TX_8X8) : TX_4X4; vp9_encode_intra_block_plane(x, bsize, 0); - this_error = vp9_get_mb_ss(x->plane[0].src_diff); + this_error = vpx_get_mb_ss(x->plane[0].src_diff); + + // Keep a record of blocks that have almost no intra error residual + // (i.e. are in effect completely flat and untextured in the intra + // domain). 
In natural videos this is uncommon, but it is much more + // common in animations, graphics and screen content, so may be used + // as a signal to detect these types of content. + if (this_error < UL_INTRA_THRESH) { + ++intra_skip_count; + } else if ((mb_col > 0) && (image_data_start_row == INVALID_ROW)) { + image_data_start_row = mb_row; + } + #if CONFIG_VP9_HIGHBITDEPTH if (cm->use_highbitdepth) { switch (cm->bit_depth) { @@ -961,6 +987,18 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { vp9_clear_system_state(); } + // Clamp the image start to rows/2. This number of rows is discarded top + // and bottom as dead data so rows / 2 means the frame is blank. + if ((image_data_start_row > cm->mb_rows / 2) || + (image_data_start_row == INVALID_ROW)) { + image_data_start_row = cm->mb_rows / 2; + } + // Exclude any image dead zone + if (image_data_start_row > 0) { + intra_skip_count = + MAX(0, intra_skip_count - (image_data_start_row * cm->mb_cols * 2)); + } + { FIRSTPASS_STATS fps; // The minimum error here insures some bit allocation to frames even @@ -985,6 +1023,8 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { fps.pcnt_inter = (double)intercount / num_mbs; fps.pcnt_second_ref = (double)second_ref_count / num_mbs; fps.pcnt_neutral = (double)neutral_count / num_mbs; + fps.intra_skip_pct = (double)intra_skip_count / num_mbs; + fps.inactive_zone_rows = (double)image_data_start_row; if (mvcount > 0) { fps.MVr = (double)sum_mvr / mvcount; @@ -1106,21 +1146,25 @@ static double calc_correction_factor(double err_per_mb, static int get_twopass_worst_quality(const VP9_COMP *cpi, const double section_err, + double inactive_zone, int section_target_bandwidth, double group_weight_factor) { const RATE_CONTROL *const rc = &cpi->rc; const VP9EncoderConfig *const oxcf = &cpi->oxcf; + inactive_zone = fclamp(inactive_zone, 0.0, 1.0); + if (section_target_bandwidth <= 0) { return rc->worst_quality; // Highest value allowed } else { const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs : cpi->common.MBs; - const double err_per_mb = section_err / num_mbs; + const int active_mbs = MAX(1, num_mbs - (int)(num_mbs * inactive_zone)); + const double av_err_per_mb = section_err / active_mbs; const double speed_term = 1.0 + 0.04 * oxcf->speed; - const double ediv_size_correction = num_mbs / EDIV_SIZE_FACTOR; + const double ediv_size_correction = (double)num_mbs / EDIV_SIZE_FACTOR; const int target_norm_bits_per_mb = ((uint64_t)section_target_bandwidth << - BPER_MB_NORMBITS) / num_mbs; + BPER_MB_NORMBITS) / active_mbs; int q; int is_svc_upper_layer = 0; @@ -1133,7 +1177,7 @@ static int get_twopass_worst_quality(const VP9_COMP *cpi, // content at the given rate. for (q = rc->best_quality; q < rc->worst_quality; ++q) { const double factor = - calc_correction_factor(err_per_mb, + calc_correction_factor(av_err_per_mb, ERR_DIVISOR - ediv_size_correction, is_svc_upper_layer ? SVC_FACTOR_PT_LOW : FACTOR_PT_LOW, FACTOR_PT_HIGH, q, @@ -1246,8 +1290,9 @@ void vp9_init_second_pass(VP9_COMP *cpi) { twopass->modified_error_left = modified_error_total; } - // Reset the vbr bits off target counter + // Reset the vbr bits off target counters cpi->rc.vbr_bits_off_target = 0; + cpi->rc.vbr_bits_off_target_fast = 0; cpi->rc.rate_error_estimate = 0; @@ -1411,6 +1456,8 @@ static double calc_frame_boost(VP9_COMP *cpi, const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? 
cpi->initial_mbs : cpi->common.MBs; + // TODO(paulwilkins): correct for dead zone + // Underlying boost factor is based on inter error ratio. frame_boost = (BASELINE_ERR_PER_MB * num_mbs) / DOUBLE_DIVIDE_CHECK(this_frame->coded_error); @@ -1695,7 +1742,7 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits, mid_frame_idx = frame_index + (rc->baseline_gf_interval >> 1) - 1; // Allocate bits to the other frames in the group. - for (i = 0; i < rc->baseline_gf_interval - 1; ++i) { + for (i = 0; i < rc->baseline_gf_interval - rc->source_alt_ref_pending; ++i) { int arf_idx = 0; if (EOF == input_stats(twopass, &frame_stats)) break; @@ -1776,6 +1823,8 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { #if GROUP_ADAPTIVE_MAXQ double gf_group_raw_error = 0.0; #endif + double gf_group_skip_pct = 0.0; + double gf_group_inactive_zone_rows = 0.0; double gf_first_frame_err = 0.0; double mod_frame_err = 0.0; @@ -1825,6 +1874,8 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { #if GROUP_ADAPTIVE_MAXQ gf_group_raw_error -= this_frame->coded_error; #endif + gf_group_skip_pct -= this_frame->intra_skip_pct; + gf_group_inactive_zone_rows -= this_frame->inactive_zone_rows; } // Motion breakout threshold for loop below depends on image size. @@ -1869,6 +1920,8 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { #if GROUP_ADAPTIVE_MAXQ gf_group_raw_error += this_frame->coded_error; #endif + gf_group_skip_pct += this_frame->intra_skip_pct; + gf_group_inactive_zone_rows += this_frame->inactive_zone_rows; if (EOF == input_stats(twopass, &next_frame)) break; @@ -1933,8 +1986,26 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Was the group length constrained by the requirement for a new KF? rc->constrained_gf_group = (i >= rc->frames_to_key) ? 1 : 0; + // Should we use the alternate reference frame. + if (allow_alt_ref && + (i < cpi->oxcf.lag_in_frames) && + (i >= rc->min_gf_interval)) { + // Calculate the boost for alt ref. + rc->gfu_boost = calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost, + &b_boost); + rc->source_alt_ref_pending = 1; + + // Test to see if multi arf is appropriate. + cpi->multi_arf_enabled = + (cpi->multi_arf_allowed && (rc->baseline_gf_interval >= 6) && + (zero_motion_accumulator < 0.995)) ? 1 : 0; + } else { + rc->gfu_boost = MAX((int)boost_score, MIN_ARF_GF_BOOST); + rc->source_alt_ref_pending = 0; + } + // Set the interval until the next gf. - if (is_key_frame || rc->source_alt_ref_active) + if (is_key_frame || rc->source_alt_ref_pending) rc->baseline_gf_interval = i - 1; else rc->baseline_gf_interval = i; @@ -1953,30 +2024,14 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { #if GROUP_ADAPTIVE_MAXQ gf_group_raw_error += this_frame->coded_error; #endif + gf_group_skip_pct += this_frame->intra_skip_pct; + gf_group_inactive_zone_rows += this_frame->inactive_zone_rows; } rc->baseline_gf_interval = new_gf_interval; } rc->frames_till_gf_update_due = rc->baseline_gf_interval; - // Should we use the alternate reference frame. - if (allow_alt_ref && - (i < cpi->oxcf.lag_in_frames) && - (i >= rc->min_gf_interval)) { - // Calculate the boost for alt ref. - rc->gfu_boost = calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost, - &b_boost); - rc->source_alt_ref_pending = 1; - - // Test to see if multi arf is appropriate. - cpi->multi_arf_enabled = - (cpi->multi_arf_allowed && (rc->baseline_gf_interval >= 6) && - (zero_motion_accumulator < 0.995)) ? 
1 : 0; - } else { - rc->gfu_boost = MAX((int)boost_score, MIN_ARF_GF_BOOST); - rc->source_alt_ref_pending = 0; - } - // Reset the file position. reset_fpf_position(twopass, start_pos); @@ -1993,6 +2048,12 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { const int vbr_group_bits_per_frame = (int)(gf_group_bits / rc->baseline_gf_interval); const double group_av_err = gf_group_raw_error / rc->baseline_gf_interval; + const double group_av_skip_pct = + gf_group_skip_pct / rc->baseline_gf_interval; + const double group_av_inactive_zone = + ((gf_group_inactive_zone_rows * 2) / + (rc->baseline_gf_interval * (double)cm->mb_rows)); + int tmp_q; // rc factor is a weight factor that corrects for local rate control drift. double rc_factor = 1.0; @@ -2004,7 +2065,9 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { (double)(100 - rc->rate_error_estimate) / 100.0); } tmp_q = - get_twopass_worst_quality(cpi, group_av_err, vbr_group_bits_per_frame, + get_twopass_worst_quality(cpi, group_av_err, + (group_av_skip_pct + group_av_inactive_zone), + vbr_group_bits_per_frame, twopass->kfgroup_inter_fraction * rc_factor); twopass->active_worst_quality = MAX(tmp_q, twopass->active_worst_quality >> 1); @@ -2413,7 +2476,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { } // Define the reference buffers that will be updated post encode. -void configure_buffer_updates(VP9_COMP *cpi) { +static void configure_buffer_updates(VP9_COMP *cpi) { TWO_PASS *const twopass = &cpi->twopass; cpi->rc.is_src_frame_alt_ref = 0; @@ -2460,7 +2523,7 @@ void configure_buffer_updates(VP9_COMP *cpi) { } } -int is_skippable_frame(const VP9_COMP *cpi) { +static int is_skippable_frame(const VP9_COMP *cpi) { // If the current frame does not have non-zero motion vector detected in the // first pass, and so do its previous and forward frames, then this frame // can be skipped for partition check, and the partition size is assigned @@ -2543,10 +2606,17 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { // Special case code for first frame. const int section_target_bandwidth = (int)(twopass->bits_left / frames_left); + const double section_length = twopass->total_left_stats.count; const double section_error = - twopass->total_left_stats.coded_error / twopass->total_left_stats.count; + twopass->total_left_stats.coded_error / section_length; + const double section_intra_skip = + twopass->total_left_stats.intra_skip_pct / section_length; + const double section_inactive_zone = + (twopass->total_left_stats.inactive_zone_rows * 2) / + ((double)cm->mb_rows * section_length); const int tmp_q = get_twopass_worst_quality(cpi, section_error, + section_intra_skip + section_inactive_zone, section_target_bandwidth, DEFAULT_GRP_WEIGHT); twopass->active_worst_quality = tmp_q; @@ -2562,6 +2632,12 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { if (EOF == input_stats(twopass, &this_frame)) return; + // Set the frame content type flag. + if (this_frame.intra_skip_pct >= FC_ANIMATION_THRESH) + twopass->fr_content_type = FC_GRAPHICS_ANIMATION; + else + twopass->fr_content_type = FC_NORMAL; + // Keyframe and section processing. if (rc->frames_to_key == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY)) { FIRSTPASS_STATS this_frame_copy; @@ -2580,9 +2656,8 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { cpi->ref_frame_flags &= (~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG); lc->frames_from_key_frame = 0; - // Reset the empty frame resolution since we have a key frame. 
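/*
 * Illustrative sketch (editorial, not part of this patch): the new
 * inactive_zone argument to get_twopass_worst_quality() discounts macroblocks
 * that carry no real image data (the intra-skip percentage plus the dead rows
 * accumulated for the group above) before the per-MB error and bit budget are
 * derived. A simplified standalone model of that adjustment; the
 * BPER_MB_NORMBITS scaling used by the real code is omitted:
 */
void active_mb_rates(double section_err, double inactive_zone, int num_mbs,
                     int target_bits, double *av_err_per_mb,
                     double *bits_per_active_mb) {
  int active_mbs;
  if (inactive_zone < 0.0) inactive_zone = 0.0;   /* fclamp(inactive_zone, 0, 1) */
  if (inactive_zone > 1.0) inactive_zone = 1.0;
  active_mbs = num_mbs - (int)(num_mbs * inactive_zone);
  if (active_mbs < 1) active_mbs = 1;             /* MAX(1, ...) guard           */
  *av_err_per_mb = section_err / active_mbs;      /* error spread over live MBs  */
  *bits_per_active_mb = (double)target_bits / active_mbs;
}
/*
 * E.g. a 30% inactive zone leaves 70% of the MBs sharing the same bit budget,
 * so bits-per-MB rises and the q loop above typically settles on a lower
 * (better) worst quality.
 */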
- cpi->svc.empty_frame_width = cm->width; - cpi->svc.empty_frame_height = cm->height; + // Encode an intra only empty frame since we have a key frame. + cpi->svc.encode_intra_empty_frame = 1; } } else { cm->frame_type = INTER_FRAME; @@ -2649,6 +2724,7 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { #define MINQ_ADJ_LIMIT 48 #define MINQ_ADJ_LIMIT_CQ 20 +#define HIGH_UNDERSHOOT_RATIO 2 void vp9_twopass_postencode_update(VP9_COMP *cpi) { TWO_PASS *const twopass = &cpi->twopass; RATE_CONTROL *const rc = &cpi->rc; @@ -2715,5 +2791,32 @@ void vp9_twopass_postencode_update(VP9_COMP *cpi) { twopass->extend_minq = clamp(twopass->extend_minq, 0, minq_adj_limit); twopass->extend_maxq = clamp(twopass->extend_maxq, 0, maxq_adj_limit); + + // If there is a big and undexpected undershoot then feed the extra + // bits back in quickly. One situation where this may happen is if a + // frame is unexpectedly almost perfectly predicted by the ARF or GF + // but not very well predcited by the previous frame. + if (!frame_is_kf_gf_arf(cpi) && !cpi->rc.is_src_frame_alt_ref) { + int fast_extra_thresh = rc->base_frame_target / HIGH_UNDERSHOOT_RATIO; + if (rc->projected_frame_size < fast_extra_thresh) { + rc->vbr_bits_off_target_fast += + fast_extra_thresh - rc->projected_frame_size; + rc->vbr_bits_off_target_fast = + MIN(rc->vbr_bits_off_target_fast, (4 * rc->avg_frame_bandwidth)); + + // Fast adaptation of minQ if necessary to use up the extra bits. + if (rc->avg_frame_bandwidth) { + twopass->extend_minq_fast = + (int)(rc->vbr_bits_off_target_fast * 8 / rc->avg_frame_bandwidth); + } + twopass->extend_minq_fast = MIN(twopass->extend_minq_fast, + minq_adj_limit - twopass->extend_minq); + } else if (rc->vbr_bits_off_target_fast) { + twopass->extend_minq_fast = MIN(twopass->extend_minq_fast, + minq_adj_limit - twopass->extend_minq); + } else { + twopass->extend_minq_fast = 0; + } + } } } diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.h index 08e7a8bf114..00479322d31 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.h @@ -51,6 +51,8 @@ typedef struct { double pcnt_motion; double pcnt_second_ref; double pcnt_neutral; + double intra_skip_pct; + double inactive_zone_rows; // Image mask rows top and bottom. 
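/*
 * Illustrative sketch (editorial, not part of this patch): the fast-undershoot
 * path added to vp9_twopass_postencode_update() above banks the bits left over
 * when a frame lands far under target (below base_frame_target /
 * HIGH_UNDERSHOOT_RATIO), caps the bank at four average frames, and widens minQ
 * in proportion. A worked standalone example with hypothetical numbers; the
 * final clamp against (minq_adj_limit - extend_minq) is omitted:
 */
#include <stdint.h>
#include <stdio.h>

int main(void) {
  const int base_frame_target = 40000;     /* bits planned for this frame     */
  const int projected_frame_size = 12000;  /* bits the frame actually used    */
  const int avg_frame_bandwidth = 30000;   /* average per-frame bit budget    */
  int64_t vbr_bits_off_target_fast = 0;
  int extend_minq_fast = 0;

  const int fast_extra_thresh = base_frame_target / 2;  /* HIGH_UNDERSHOOT_RATIO */
  if (projected_frame_size < fast_extra_thresh) {
    vbr_bits_off_target_fast += fast_extra_thresh - projected_frame_size; /* 8000 */
    if (vbr_bits_off_target_fast > 4 * (int64_t)avg_frame_bandwidth)
      vbr_bits_off_target_fast = 4 * (int64_t)avg_frame_bandwidth;        /* cap  */
    if (avg_frame_bandwidth)
      extend_minq_fast =
          (int)(vbr_bits_off_target_fast * 8 / avg_frame_bandwidth);      /* 2    */
  }
  printf("banked=%lld extend_minq_fast=%d\n",
         (long long)vbr_bits_off_target_fast, extend_minq_fast);
  return 0;
}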
double MVr; double mvr_abs; double MVc; @@ -73,6 +75,13 @@ typedef enum { FRAME_UPDATE_TYPES = 5 } FRAME_UPDATE_TYPE; +#define FC_ANIMATION_THRESH 0.15 +typedef enum { + FC_NORMAL = 0, + FC_GRAPHICS_ANIMATION = 1, + FRAME_CONTENT_TYPES = 2 +} FRAME_CONTENT_TYPE; + typedef struct { unsigned char index; RATE_FACTOR_LEVEL rf_level[(MAX_LAG_BUFFERS * 2) + 1]; @@ -103,6 +112,8 @@ typedef struct { uint8_t *this_frame_mb_stats; FIRSTPASS_MB_STATS firstpass_mb_stats; #endif + // An indication of the content type of the current frame + FRAME_CONTENT_TYPE fr_content_type; // Projected total bits available for a key frame group of frames int64_t kf_group_bits; @@ -122,6 +133,7 @@ typedef struct { int baseline_active_worst_quality; int extend_minq; int extend_maxq; + int extend_minq_fast; GF_GROUP gf_group; } TWO_PASS; @@ -135,6 +147,7 @@ void vp9_end_first_pass(struct VP9_COMP *cpi); void vp9_init_second_pass(struct VP9_COMP *cpi); void vp9_rc_get_second_pass_params(struct VP9_COMP *cpi); +void vp9_twopass_postencode_update(struct VP9_COMP *cpi); // Post encode update of the rate control parameters for 2-pass void vp9_twopass_postencode_update(struct VP9_COMP *cpi); diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mcomp.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mcomp.c index 80c509a1b49..081b99f9f1f 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mcomp.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mcomp.c @@ -13,10 +13,13 @@ #include <stdio.h> #include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" #include "vpx_mem/vpx_mem.h" +#include "vpx_ports/mem.h" #include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_reconinter.h" #include "vp9/encoder/vp9_encoder.h" #include "vp9/encoder/vp9_mcomp.h" @@ -159,9 +162,9 @@ void vp9_init3smotion_compensation(search_site_config *cfg, int stride) { error_per_bit + 4096) >> 13 : 0) -// convert motion vector component to offset for svf calc +// convert motion vector component to offset for sv[a]f calc static INLINE int sp(int x) { - return (x & 7) << 1; + return x & 7; } static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) { @@ -283,32 +286,32 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) { bestmv->row *= 8; \ bestmv->col *= 8; -static INLINE unsigned int setup_center_error(const MACROBLOCKD *xd, - const MV *bestmv, - const MV *ref_mv, - int error_per_bit, - const vp9_variance_fn_ptr_t *vfp, - const uint8_t *const src, - const int src_stride, - const uint8_t *const y, - int y_stride, - const uint8_t *second_pred, - int w, int h, int offset, - int *mvjcost, int *mvcost[2], - unsigned int *sse1, - int *distortion) { +static unsigned int setup_center_error(const MACROBLOCKD *xd, + const MV *bestmv, + const MV *ref_mv, + int error_per_bit, + const vp9_variance_fn_ptr_t *vfp, + const uint8_t *const src, + const int src_stride, + const uint8_t *const y, + int y_stride, + const uint8_t *second_pred, + int w, int h, int offset, + int *mvjcost, int *mvcost[2], + unsigned int *sse1, + int *distortion) { unsigned int besterr; #if CONFIG_VP9_HIGHBITDEPTH if (second_pred != NULL) { if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { DECLARE_ALIGNED(16, uint16_t, comp_pred16[64 * 64]); - vp9_highbd_comp_avg_pred(comp_pred16, second_pred, w, h, y + offset, + vpx_highbd_comp_avg_pred(comp_pred16, second_pred, w, h, y + offset, y_stride); besterr = vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, src, src_stride, sse1); } else { 
DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]); - vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride); + vpx_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride); besterr = vfp->vf(comp_pred, w, src, src_stride, sse1); } } else { @@ -320,7 +323,7 @@ static INLINE unsigned int setup_center_error(const MACROBLOCKD *xd, (void) xd; if (second_pred != NULL) { DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]); - vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride); + vpx_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride); besterr = vfp->vf(comp_pred, w, src, src_stride, sse1); } else { besterr = vfp->vf(y + offset, y_stride, src, src_stride, sse1); @@ -607,7 +610,7 @@ int vp9_find_best_sub_pixel_tree_pruned(const MACROBLOCK *x, return besterr; } -const MV search_step_table[12] = { +static const MV search_step_table[12] = { // left, right, up, down {0, -4}, {0, 4}, {-4, 0}, {4, 0}, {0, -2}, {0, 2}, {-2, 0}, {2, 0}, @@ -676,16 +679,14 @@ int vp9_find_best_sub_pixel_tree(const MACROBLOCK *x, tc = bc + search_step[idx].col; if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) { const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3); - int row_offset = (tr & 0x07) << 1; - int col_offset = (tc & 0x07) << 1; MV this_mv; this_mv.row = tr; this_mv.col = tc; if (second_pred == NULL) - thismse = vfp->svf(pre_address, y_stride, col_offset, row_offset, + thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr), src_address, src_stride, &sse); else - thismse = vfp->svaf(pre_address, y_stride, col_offset, row_offset, + thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr), src_address, src_stride, &sse, second_pred); cost_array[idx] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit); @@ -706,14 +707,12 @@ int vp9_find_best_sub_pixel_tree(const MACROBLOCK *x, tr = br + (cost_array[2] < cost_array[3] ? 
-hstep : hstep); if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) { const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3); - int row_offset = (tr & 0x07) << 1; - int col_offset = (tc & 0x07) << 1; MV this_mv = {tr, tc}; if (second_pred == NULL) - thismse = vfp->svf(pre_address, y_stride, col_offset, row_offset, + thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr), src_address, src_stride, &sse); else - thismse = vfp->svaf(pre_address, y_stride, col_offset, row_offset, + thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr), src_address, src_stride, &sse, second_pred); cost_array[4] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit); @@ -1788,8 +1787,11 @@ static const MV search_pos[4] = { }; unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x, - BLOCK_SIZE bsize) { + BLOCK_SIZE bsize, + int mi_row, int mi_col) { MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; + struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0, 0}}; DECLARE_ALIGNED(16, int16_t, hbuf[128]); DECLARE_ALIGNED(16, int16_t, vbuf[128]); DECLARE_ALIGNED(16, int16_t, src_hbuf[64]); @@ -1806,12 +1808,34 @@ unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x, unsigned int best_sad, tmp_sad, this_sad[4]; MV this_mv; const int norm_factor = 3 + (bw >> 5); + const YV12_BUFFER_CONFIG *scaled_ref_frame = + vp9_get_scaled_ref_frame(cpi, mbmi->ref_frame[0]); + + if (scaled_ref_frame) { + int i; + // Swap out the reference frame for a version that's been scaled to + // match the resolution of the current frame, allowing the existing + // motion search code to be used without additional modifications. + for (i = 0; i < MAX_MB_PLANE; i++) + backup_yv12[i] = xd->plane[i].pre[0]; + vp9_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL); + } #if CONFIG_VP9_HIGHBITDEPTH - tmp_mv->row = 0; - tmp_mv->col = 0; - return cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf, src_stride, - xd->plane[0].pre[0].buf, ref_stride); + { + unsigned int this_sad; + tmp_mv->row = 0; + tmp_mv->col = 0; + this_sad = cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf, src_stride, + xd->plane[0].pre[0].buf, ref_stride); + + if (scaled_ref_frame) { + int i; + for (i = 0; i < MAX_MB_PLANE; i++) + xd->plane[i].pre[0] = backup_yv12[i]; + } + return this_sad; + } #endif // Set up prediction 1-D reference set @@ -1889,6 +1913,12 @@ unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x, tmp_mv->row *= 8; tmp_mv->col *= 8; + if (scaled_ref_frame) { + int i; + for (i = 0; i < MAX_MB_PLANE; i++) + xd->plane[i].pre[0] = backup_yv12[i]; + } + return best_sad; } diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mcomp.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mcomp.h index dd8a4607942..99c1afa28ff 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mcomp.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mcomp.h @@ -83,7 +83,8 @@ int vp9_full_pixel_diamond(const struct VP9_COMP *cpi, MACROBLOCK *x, // Perform integral projection based motion estimation. 
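/*
 * Illustrative sketch (editorial, not part of this patch): motion vector
 * components in the sub-pixel search above are in 1/8-pel units, so each
 * component splits into a whole-pixel offset (x >> 3) used to address the
 * reference block and a 0..7 sub-pel phase (x & 7) handed to the svf/svaf
 * functions; the earlier "<< 1" doubling of that phase is dropped by this
 * change. A standalone model of the split:
 */
void split_mv_component(int mv_eighth_pel, int *full_pel, int *subpel_phase) {
  *full_pel = mv_eighth_pel >> 3;     /* whole-pixel part of the offset        */
  *subpel_phase = mv_eighth_pel & 7;  /* 1/8-pel phase, 0..7, as sp() returns  */
}
/* E.g. tc = 21 (2 full pixels + 5/8 pel) gives full_pel = 2, subpel_phase = 5. */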
unsigned int vp9_int_pro_motion_estimation(const struct VP9_COMP *cpi, MACROBLOCK *x, - BLOCK_SIZE bsize); + BLOCK_SIZE bsize, + int mi_row, int mi_col); typedef int (integer_mv_pattern_search_fn) ( const MACROBLOCK *x, diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_picklpf.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_picklpf.c index 5eb5d542b78..8e191038514 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_picklpf.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_picklpf.c @@ -14,6 +14,7 @@ #include "./vpx_scale_rtcd.h" #include "vpx_mem/vpx_mem.h" +#include "vpx_ports/mem.h" #include "vp9/common/vp9_loopfilter.h" #include "vp9/common/vp9_onyxc_int.h" diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_pickmode.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_pickmode.c index 9fb7cfba7bf..3eaa99054ce 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_pickmode.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_pickmode.c @@ -14,8 +14,10 @@ #include <stdio.h> #include "./vp9_rtcd.h" +#include "./vpx_dsp_rtcd.h" #include "vpx_mem/vpx_mem.h" +#include "vpx_ports/mem.h" #include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_common.h" @@ -23,6 +25,7 @@ #include "vp9/common/vp9_pred_common.h" #include "vp9/common/vp9_reconinter.h" #include "vp9/common/vp9_reconintra.h" +#include "vp9/common/vp9_scan.h" #include "vp9/encoder/vp9_cost.h" #include "vp9/encoder/vp9_encoder.h" @@ -214,7 +217,7 @@ static void block_variance(const uint8_t *src, int src_stride, for (i = 0; i < h; i += block_size) { for (j = 0; j < w; j += block_size) { - vp9_get8x8var(src + src_stride * i + j, src_stride, + vpx_get8x8var(src + src_stride * i + j, src_stride, ref + ref_stride * i + j, ref_stride, &sse8x8[k], &sum8x8[k]); *sse += sse8x8[k]; @@ -294,13 +297,11 @@ static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize, else tx_size = TX_8X8; - if (cpi->sf.partition_search_type == VAR_BASED_PARTITION) { - if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && - cyclic_refresh_segment_id_boosted(xd->mi[0]->mbmi.segment_id)) - tx_size = TX_8X8; - else if (tx_size > TX_16X16) - tx_size = TX_16X16; - } + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && + cyclic_refresh_segment_id_boosted(xd->mi[0]->mbmi.segment_id)) + tx_size = TX_8X8; + else if (tx_size > TX_16X16) + tx_size = TX_16X16; } else { tx_size = MIN(max_txsize_lookup[bsize], tx_mode_to_biggest_tx_size[cpi->common.tx_mode]); @@ -478,13 +479,11 @@ static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize, else xd->mi[0]->mbmi.tx_size = TX_8X8; - if (cpi->sf.partition_search_type == VAR_BASED_PARTITION) { - if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && - cyclic_refresh_segment_id_boosted(xd->mi[0]->mbmi.segment_id)) - xd->mi[0]->mbmi.tx_size = TX_8X8; - else if (xd->mi[0]->mbmi.tx_size > TX_16X16) - xd->mi[0]->mbmi.tx_size = TX_16X16; - } + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && + cyclic_refresh_segment_id_boosted(xd->mi[0]->mbmi.segment_id)) + xd->mi[0]->mbmi.tx_size = TX_8X8; + else if (xd->mi[0]->mbmi.tx_size > TX_16X16) + xd->mi[0]->mbmi.tx_size = TX_16X16; } else { xd->mi[0]->mbmi.tx_size = MIN(max_txsize_lookup[bsize], @@ -659,7 +658,8 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist, block = 0; *rate = 0; *dist = 0; - *sse = (*sse << 6) >> shift; + if (*sse < INT64_MAX) + *sse = (*sse << 6) >> shift; for (r = 0; r < max_blocks_high; r += block_step) { for (c = 0; c < 
num_4x4_w; c += block_step) { if (c < max_blocks_wide) { @@ -1059,6 +1059,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, int mi_col, RD_COST *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) { VP9_COMMON *const cm = &cpi->common; + SPEED_FEATURES *const sf = &cpi->sf; TileInfo *const tile_info = &tile_data->tile_info; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; @@ -1078,9 +1079,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, unsigned int var_y = UINT_MAX; unsigned int sse_y = UINT_MAX; // Reduce the intra cost penalty for small blocks (<=16x16). - const int reduction_fac = - (cpi->sf.partition_search_type == VAR_BASED_PARTITION && - bsize <= BLOCK_16X16) ? ((bsize <= BLOCK_8X8) ? 4 : 2) : 0; + const int reduction_fac = (bsize <= BLOCK_16X16) ? + ((bsize <= BLOCK_8X8) ? 4 : 2) : 0; const int intra_cost_penalty = vp9_get_intra_cost_penalty( cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth) >> reduction_fac; const int64_t inter_mode_thresh = RDCOST(x->rdmult, x->rddiv, @@ -1180,7 +1180,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, if (cm->use_prev_frame_mvs) vp9_find_mv_refs(cm, xd, tile_info, xd->mi[0], ref_frame, - candidates, mi_row, mi_col, NULL, NULL); + candidates, mi_row, mi_col, NULL, NULL, + xd->mi[0]->mbmi.mode_context); else const_motion[ref_frame] = mv_refs_rt(cm, xd, tile_info, xd->mi[0], @@ -1219,7 +1220,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, continue; i = (ref_frame == LAST_FRAME) ? GOLDEN_FRAME : LAST_FRAME; - if (cpi->ref_frame_flags & flag_list[i]) + if ((cpi->ref_frame_flags & flag_list[i]) && sf->reference_masking) if (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[i] << 1)) ref_frame_skip_mask |= (1 << ref_frame); if (ref_frame_skip_mask & (1 << ref_frame)) @@ -1247,7 +1248,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, if (bsize < BLOCK_16X16) continue; - tmp_sad = vp9_int_pro_motion_estimation(cpi, x, bsize); + tmp_sad = vp9_int_pro_motion_estimation(cpi, x, bsize, mi_row, mi_col); if (tmp_sad > x->pred_mv_sad[LAST_FRAME]) continue; @@ -1594,7 +1595,6 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, if (cpi->sf.adaptive_rd_thresh) { THR_MODES best_mode_idx = mode_idx[best_ref_frame][mode_offset(mbmi->mode)]; - PREDICTION_MODE this_mode; if (best_ref_frame == INTRA_FRAME) { // Only consider the modes that are included in the intra_mode_list. @@ -1604,12 +1604,12 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, // TODO(yunqingwang): Check intra mode mask and only update freq_fact // for those valid modes. 
for (i = 0; i < intra_modes; i++) { - PREDICTION_MODE this_mode = intra_mode_list[i]; update_thresh_freq_fact(cpi, tile_data, bsize, INTRA_FRAME, - best_mode_idx, this_mode); + best_mode_idx, intra_mode_list[i]); } } else { for (ref_frame = LAST_FRAME; ref_frame <= GOLDEN_FRAME; ++ref_frame) { + PREDICTION_MODE this_mode; if (best_ref_frame != ref_frame) continue; for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) { update_thresh_freq_fact(cpi, tile_data, bsize, ref_frame, @@ -1660,7 +1660,8 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf); vp9_find_mv_refs(cm, xd, tile_info, xd->mi[0], ref_frame, - candidates, mi_row, mi_col, NULL, NULL); + candidates, mi_row, mi_col, NULL, NULL, + xd->mi[0]->mbmi.mode_context); vp9_find_best_ref_mvs(xd, cm->allow_high_precision_mv, candidates, &dummy_mv[0], &dummy_mv[1]); @@ -1692,8 +1693,8 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, // If the segment reference frame feature is enabled.... // then do nothing if the current ref frame is not allowed.. - if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) && - vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) + if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) && + get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) continue; mbmi->ref_frame[0] = ref_frame; @@ -1734,7 +1735,8 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, b_mv[NEWMV].as_int = INVALID_MV; vp9_append_sub8x8_mvs_for_idx(cm, xd, tile_info, i, 0, mi_row, mi_col, &b_mv[NEARESTMV], - &b_mv[NEARMV]); + &b_mv[NEARMV], + xd->mi[0]->mbmi.mode_context); for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) { int b_rate = 0; diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_quantize.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_quantize.c index 3c07e2c2437..e6e17c073e3 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_quantize.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_quantize.c @@ -11,6 +11,7 @@ #include <math.h> #include "vpx_mem/vpx_mem.h" +#include "vpx_ports/mem.h" #include "vp9/common/vp9_quant_common.h" #include "vp9/common/vp9_seg_common.h" @@ -677,7 +678,7 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) { x->plane[i].quant_thred[1] = x->plane[i].zbin[1] * x->plane[i].zbin[1]; } - x->skip_block = vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP); + x->skip_block = segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP); x->q_index = qindex; x->errorperbit = rdmult >> 6; diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.c index 4c33ffd977b..85003f65ef1 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.c @@ -16,6 +16,7 @@ #include <string.h> #include "vpx_mem/vpx_mem.h" +#include "vpx_ports/mem.h" #include "vp9/common/vp9_alloccommon.h" #include "vp9/encoder/vp9_aq_cyclicrefresh.h" @@ -136,7 +137,7 @@ static void init_minq_luts(int *kf_low_m, int *kf_high_m, } } -void vp9_rc_init_minq_luts() { +void vp9_rc_init_minq_luts(void) { init_minq_luts(kf_low_motion_minq_8, kf_high_motion_minq_8, arfgf_low_motion_minq_8, arfgf_high_motion_minq_8, inter_minq_8, rtc_minq_8, VPX_BITS_8); @@ -233,13 +234,16 @@ int vp9_rc_clamp_iframe_target_size(const 
VP9_COMP *const cpi, int target) { return target; } -// Update the buffer level for higher layers, given the encoded current layer. +// Update the buffer level for higher temporal layers, given the encoded current +// temporal layer. static void update_layer_buffer_level(SVC *svc, int encoded_frame_size) { - int temporal_layer = 0; + int i = 0; int current_temporal_layer = svc->temporal_layer_id; - for (temporal_layer = current_temporal_layer + 1; - temporal_layer < svc->number_temporal_layers; ++temporal_layer) { - LAYER_CONTEXT *lc = &svc->layer_context[temporal_layer]; + for (i = current_temporal_layer + 1; + i < svc->number_temporal_layers; ++i) { + const int layer = LAYER_IDS_TO_IDX(svc->spatial_layer_id, i, + svc->number_temporal_layers); + LAYER_CONTEXT *lc = &svc->layer_context[layer]; RATE_CONTROL *lrc = &lc->rc; int bits_off_for_this_layer = (int)(lc->target_bandwidth / lc->framerate - encoded_frame_size); @@ -267,7 +271,7 @@ static void update_buffer_level(VP9_COMP *cpi, int encoded_frame_size) { rc->bits_off_target = MIN(rc->bits_off_target, rc->maximum_buffer_size); rc->buffer_level = rc->bits_off_target; - if (cpi->use_svc && cpi->oxcf.rc_mode == VPX_CBR) { + if (is_one_pass_cbr_svc(cpi)) { update_layer_buffer_level(&cpi->svc, encoded_frame_size); } } @@ -491,7 +495,10 @@ int vp9_rc_regulate_q(const VP9_COMP *cpi, int target_bits_per_frame, i = active_best_quality; do { - if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled) { + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && + cm->seg.enabled && + cpi->svc.temporal_layer_id == 0 && + cpi->svc.spatial_layer_id == 0) { bits_per_mb_at_this_q = (int)vp9_cyclic_refresh_rc_bits_per_mb(cpi, i, correction_factor); } else { @@ -1057,10 +1064,12 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, if (frame_is_intra_only(cm) || (!rc->is_src_frame_alt_ref && (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))) { - active_best_quality -= cpi->twopass.extend_minq; + active_best_quality -= + (cpi->twopass.extend_minq + cpi->twopass.extend_minq_fast); active_worst_quality += (cpi->twopass.extend_maxq / 2); } else { - active_best_quality -= cpi->twopass.extend_minq / 2; + active_best_quality -= + (cpi->twopass.extend_minq + cpi->twopass.extend_minq_fast) / 2; active_worst_quality += cpi->twopass.extend_maxq; } } @@ -1203,11 +1212,9 @@ static void update_golden_frame_stats(VP9_COMP *cpi) { // this frame refreshes means next frames don't unless specified by user rc->frames_since_golden = 0; - if (cpi->oxcf.pass == 2) { - if (!rc->source_alt_ref_pending && - cpi->twopass.gf_group.rf_level[0] == GF_ARF_STD) - rc->source_alt_ref_active = 0; - } else if (!rc->source_alt_ref_pending) { + // If we are not using alt ref in the up and coming group clear the arf + // active flag. + if (!rc->source_alt_ref_pending) { rc->source_alt_ref_active = 0; } @@ -1414,13 +1421,14 @@ static int calc_pframe_target_size_one_pass_cbr(const VP9_COMP *cpi) { } else { target = rc->avg_frame_bandwidth; } - if (svc->number_temporal_layers > 1 && - oxcf->rc_mode == VPX_CBR) { + if (is_one_pass_cbr_svc(cpi)) { // Note that for layers, avg_frame_bandwidth is the cumulative // per-frame-bandwidth. For the target size of this frame, use the // layer average frame size (i.e., non-cumulative per-frame-bw). 
- int current_temporal_layer = svc->temporal_layer_id; - const LAYER_CONTEXT *lc = &svc->layer_context[current_temporal_layer]; + int layer = + LAYER_IDS_TO_IDX(svc->spatial_layer_id, + svc->temporal_layer_id, svc->number_temporal_layers); + const LAYER_CONTEXT *lc = &svc->layer_context[layer]; target = lc->avg_frame_size; min_frame_target = MAX(lc->avg_frame_size >> 4, FRAME_OVERHEAD_BITS); } @@ -1455,7 +1463,9 @@ static int calc_iframe_target_size_one_pass_cbr(const VP9_COMP *cpi) { if (svc->number_temporal_layers > 1 && oxcf->rc_mode == VPX_CBR) { // Use the layer framerate for temporal layers CBR mode. - const LAYER_CONTEXT *lc = &svc->layer_context[svc->temporal_layer_id]; + const int layer = LAYER_IDS_TO_IDX(svc->spatial_layer_id, + svc->temporal_layer_id, svc->number_temporal_layers); + const LAYER_CONTEXT *lc = &svc->layer_context[layer]; framerate = lc->framerate; } kf_boost = MAX(kf_boost, (int)(2 * framerate - 16)); @@ -1468,10 +1478,27 @@ static int calc_iframe_target_size_one_pass_cbr(const VP9_COMP *cpi) { return vp9_rc_clamp_iframe_target_size(cpi, target); } +// Reset information needed to set proper reference frames and buffer updates +// for temporal layering. This is called when a key frame is encoded. +static void reset_temporal_layer_to_zero(VP9_COMP *cpi) { + int sl; + LAYER_CONTEXT *lc = NULL; + cpi->svc.temporal_layer_id = 0; + + for (sl = 0; sl < cpi->svc.number_spatial_layers; ++sl) { + lc = &cpi->svc.layer_context[sl * cpi->svc.number_temporal_layers]; + lc->current_video_frame_in_layer = 0; + lc->frames_from_key_frame = 0; + } +} + void vp9_rc_get_svc_params(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; int target = rc->avg_frame_bandwidth; + const int layer = LAYER_IDS_TO_IDX(cpi->svc.spatial_layer_id, + cpi->svc.temporal_layer_id, cpi->svc.number_temporal_layers); + if ((cm->current_video_frame == 0) || (cpi->frame_flags & FRAMEFLAGS_KEY) || (cpi->oxcf.auto_key && (rc->frames_since_key % @@ -1480,30 +1507,39 @@ void vp9_rc_get_svc_params(VP9_COMP *cpi) { rc->source_alt_ref_active = 0; if (is_two_pass_svc(cpi)) { - cpi->svc.layer_context[cpi->svc.spatial_layer_id].is_key_frame = 1; + cpi->svc.layer_context[layer].is_key_frame = 1; cpi->ref_frame_flags &= (~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG); - } - - if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR) { + } else if (is_one_pass_cbr_svc(cpi)) { + cpi->svc.layer_context[layer].is_key_frame = 1; + reset_temporal_layer_to_zero(cpi); + cpi->ref_frame_flags &= + (~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG); + // Assumption here is that LAST_FRAME is being updated for a keyframe. + // Thus no change in update flags. 
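The LAYER_IDS_TO_IDX() lookups above flatten a (spatial layer, temporal layer) pair into a single index into svc->layer_context[]. A minimal sketch of that mapping, assuming the row-major layout implied by the explicit spatial_layer_id * number_temporal_layers + temporal_layer_id expressions used later in this patch (the macro itself lives in vp9_svc_layercontext.h and is not shown here):

/* Assumed shape of the macro referenced above; illustrative only. */
#define LAYER_IDS_TO_IDX(sl, tl, num_tl) ((sl) * (num_tl) + (tl))

/* Example: spatial layer 1, temporal layer 2, with 3 temporal layers per
 * spatial layer, selects layer_context[1 * 3 + 2] == layer_context[5]. */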
target = calc_iframe_target_size_one_pass_cbr(cpi); } } else { cm->frame_type = INTER_FRAME; - if (is_two_pass_svc(cpi)) { - LAYER_CONTEXT *lc = &cpi->svc.layer_context[cpi->svc.spatial_layer_id]; + LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer]; if (cpi->svc.spatial_layer_id == 0) { lc->is_key_frame = 0; } else { - lc->is_key_frame = cpi->svc.layer_context[0].is_key_frame; + lc->is_key_frame = + cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame; if (lc->is_key_frame) cpi->ref_frame_flags &= (~VP9_LAST_FLAG); } cpi->ref_frame_flags &= (~VP9_ALT_FLAG); - } - - if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR) { + } else if (is_one_pass_cbr_svc(cpi)) { + LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer]; + if (cpi->svc.spatial_layer_id == 0) { + lc->is_key_frame = 0; + } else { + lc->is_key_frame = + cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame; + } target = calc_pframe_target_size_one_pass_cbr(cpi); } } @@ -1560,6 +1596,10 @@ void vp9_rc_get_one_pass_cbr_params(VP9_COMP *cpi) { target = calc_pframe_target_size_one_pass_cbr(cpi); vp9_rc_set_frame_target(cpi, target); + if (cpi->oxcf.resize_mode == RESIZE_DYNAMIC) + cpi->resize_state = vp9_resize_one_pass_cbr(cpi); + else + cpi->resize_state = 0; } int vp9_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget, @@ -1669,9 +1709,9 @@ void vp9_rc_update_framerate(VP9_COMP *cpi) { #define VBR_PCT_ADJUSTMENT_LIMIT 50 // For VBR...adjustment to the frame target based on error from previous frames -static void vbr_rate_correction(VP9_COMP *cpi, - int *this_frame_target, - int64_t vbr_bits_off_target) { +static void vbr_rate_correction(VP9_COMP *cpi, int *this_frame_target) { + RATE_CONTROL *const rc = &cpi->rc; + int64_t vbr_bits_off_target = rc->vbr_bits_off_target; int max_delta; double position_factor = 1.0; @@ -1695,6 +1735,20 @@ static void vbr_rate_correction(VP9_COMP *cpi, (vbr_bits_off_target < -max_delta) ? max_delta : (int)-vbr_bits_off_target; } + + // Fast redistribution of bits arising from massive local undershoot. + // Dont do it for kf,arf,gf or overlay frames. + if (!frame_is_kf_gf_arf(cpi) && !rc->is_src_frame_alt_ref && + rc->vbr_bits_off_target_fast) { + int one_frame_bits = MAX(rc->avg_frame_bandwidth, *this_frame_target); + int fast_extra_bits; + fast_extra_bits = + (int)MIN(rc->vbr_bits_off_target_fast, one_frame_bits); + fast_extra_bits = (int)MIN(fast_extra_bits, + MAX(one_frame_bits / 8, rc->vbr_bits_off_target_fast / 8)); + *this_frame_target += (int)fast_extra_bits; + rc->vbr_bits_off_target_fast -= fast_extra_bits; + } } void vp9_set_target_rate(VP9_COMP *cpi) { @@ -1703,6 +1757,95 @@ void vp9_set_target_rate(VP9_COMP *cpi) { // Correction to rate target based on prior over or under shoot. if (cpi->oxcf.rc_mode == VPX_VBR || cpi->oxcf.rc_mode == VPX_CQ) - vbr_rate_correction(cpi, &target_rate, rc->vbr_bits_off_target); + vbr_rate_correction(cpi, &target_rate); vp9_rc_set_frame_target(cpi, target_rate); } + +// Check if we should resize, based on average QP from past x frames. +// Only allow for resize at most one scale down for now, scaling factor is 2. +int vp9_resize_one_pass_cbr(VP9_COMP *cpi) { + const VP9_COMMON *const cm = &cpi->common; + RATE_CONTROL *const rc = &cpi->rc; + int resize_now = 0; + cpi->resize_scale_num = 1; + cpi->resize_scale_den = 1; + // Don't resize on key frame; reset the counters on key frame. 
+  if (cm->frame_type == KEY_FRAME) {
+    cpi->resize_avg_qp = 0;
+    cpi->resize_count = 0;
+    return 0;
+  }
+  // Resize based on average QP over some window.
+  // Ignore samples close to key frame, since QP is usually high after key.
+  if (cpi->rc.frames_since_key > 2 * cpi->framerate) {
+    const int window = (int)(5 * cpi->framerate);
+    cpi->resize_avg_qp += cm->base_qindex;
+    if (cpi->rc.buffer_level < 0)
+      ++cpi->resize_buffer_underflow;
+    ++cpi->resize_count;
+    // Check for resize action every "window" frames.
+    if (cpi->resize_count >= window) {
+      int avg_qp = cpi->resize_avg_qp / cpi->resize_count;
+      // Resize down if buffer level has underflowed by a sufficient amount
+      // in the past window, and we are at the original resolution.
+      // Resize back up if average QP is low, and we are currently in a
+      // resized-down state.
+      if (cpi->resize_state == 0 &&
+          cpi->resize_buffer_underflow > (cpi->resize_count >> 3)) {
+        resize_now = 1;
+      } else if (cpi->resize_state == 1 &&
+                 avg_qp < 40 * cpi->rc.worst_quality / 100) {
+        resize_now = -1;
+      }
+      // Reset for next window measurement.
+      cpi->resize_avg_qp = 0;
+      cpi->resize_count = 0;
+      cpi->resize_buffer_underflow = 0;
+    }
+  }
+  // If the decision is to resize, reset some quantities, and check if we
+  // should reduce the rate correction factor.
+  if (resize_now != 0) {
+    int target_bits_per_frame;
+    int active_worst_quality;
+    int qindex;
+    int tot_scale_change;
+    // For now, resize is by 1/2 x 1/2.
+    cpi->resize_scale_num = 1;
+    cpi->resize_scale_den = 2;
+    tot_scale_change = (cpi->resize_scale_den * cpi->resize_scale_den) /
+        (cpi->resize_scale_num * cpi->resize_scale_num);
+    // Reset buffer level to optimal, update target size.
+    rc->buffer_level = rc->optimal_buffer_level;
+    rc->bits_off_target = rc->optimal_buffer_level;
+    rc->this_frame_target = calc_pframe_target_size_one_pass_cbr(cpi);
+    // Reset cyclic refresh parameters.
+    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled)
+      vp9_cyclic_refresh_reset_resize(cpi);
+    // Get the projected qindex, based on the scaled target frame size (scaled
+    // so target_bits_per_mb in vp9_rc_regulate_q will be the correct target).
+    target_bits_per_frame = (resize_now == 1) ?
+        rc->this_frame_target * tot_scale_change :
+        rc->this_frame_target / tot_scale_change;
+    active_worst_quality = calc_active_worst_quality_one_pass_cbr(cpi);
+    qindex = vp9_rc_regulate_q(cpi,
+                               target_bits_per_frame,
+                               rc->best_quality,
+                               active_worst_quality);
+    // If resize is down, check if the projected q index is close to
+    // worst_quality, and if so, reduce the rate correction factor (since we
+    // can likely afford a lower q for the resized frame).
+    if (resize_now == 1 &&
+        qindex > 90 * cpi->rc.worst_quality / 100) {
+      rc->rate_correction_factors[INTER_NORMAL] *= 0.85;
+    }
+    // If resize is back up, check if the projected q index is too far above
+    // the current base_qindex, and if so, reduce the rate correction factor
+    // (since we prefer to keep q for the resized frame at least close to the
+    // previous q).
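To put numbers on the scaling above: with the 1/2 x 1/2 resize, tot_scale_change evaluates to 4, so the target passed to vp9_rc_regulate_q() is multiplied by 4 when resizing down and divided by 4 when scaling back up, which (per the comment above) keeps target_bits_per_mb correct for the new frame size. A small sketch, with a made-up frame target:

/* Illustrative only; the frame target value is invented for the example. */
static void scaled_target_example(void) {
  const int tot_scale_change = (2 * 2) / (1 * 1);           /* den^2 / num^2 = 4 */
  const int frame_target = 40000;                           /* example target, in bits */
  const int down_target = frame_target * tot_scale_change;  /* 160000 when resizing down */
  const int up_target = frame_target / tot_scale_change;    /* 10000 when sizing back up */
  (void)down_target;
  (void)up_target;
}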
+ if (resize_now == -1 && + qindex > 130 * cm->base_qindex / 100) { + rc->rate_correction_factors[INTER_NORMAL] *= 0.9; + } + } + return resize_now; +} diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.h index 869f6e59e97..a10836c7444 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.h @@ -100,6 +100,7 @@ typedef struct { int64_t buffer_level; int64_t bits_off_target; int64_t vbr_bits_off_target; + int64_t vbr_bits_off_target_fast; int decimation_factor; int decimation_count; @@ -152,7 +153,7 @@ int vp9_estimate_bits_at_q(FRAME_TYPE frame_kind, int q, int mbs, double vp9_convert_qindex_to_q(int qindex, vpx_bit_depth_t bit_depth); -void vp9_rc_init_minq_luts(); +void vp9_rc_init_minq_luts(void); // Generally at the high level, the following flow is expected // to be enforced for rate control: @@ -244,6 +245,8 @@ void vp9_rc_set_gf_interval_range(const struct VP9_COMP *const cpi, void vp9_set_target_rate(struct VP9_COMP *cpi); +int vp9_resize_one_pass_cbr(struct VP9_COMP *cpi); + #ifdef __cplusplus } // extern "C" #endif diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rd.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rd.c index 194001c51a2..90ee1e44a1b 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rd.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rd.c @@ -15,6 +15,7 @@ #include "./vp9_rtcd.h" #include "vpx_mem/vpx_mem.h" +#include "vpx_ports/mem.h" #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_entropy.h" @@ -128,7 +129,7 @@ static void init_me_luts_bd(int *bit16lut, int *bit4lut, int range, } } -void vp9_init_me_luts() { +void vp9_init_me_luts(void) { init_me_luts_bd(sad_per_bit16lut_8, sad_per_bit4lut_8, QINDEX_RANGE, VPX_BITS_8); #if CONFIG_VP9_HIGHBITDEPTH @@ -264,6 +265,7 @@ static void set_block_thresholds(const VP9_COMMON *cm, RD_OPT *rd) { void vp9_initialize_rd_consts(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->td.mb; + MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; RD_OPT *const rd = &cpi->rd; int i; @@ -279,6 +281,7 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi) { cm->frame_type != KEY_FRAME) ? 
0 : 1; set_block_thresholds(cm, rd); + set_partition_probs(cm, xd); if (!cpi->sf.use_nonrd_pick_mode || cm->frame_type == KEY_FRAME) fill_token_costs(x->token_costs, cm->fc->coef_probs); @@ -286,7 +289,7 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi) { if (cpi->sf.partition_search_type != VAR_BASED_PARTITION || cm->frame_type == KEY_FRAME) { for (i = 0; i < PARTITION_CONTEXTS; ++i) - vp9_cost_tokens(cpi->partition_cost[i], get_partition_probs(cm, i), + vp9_cost_tokens(cpi->partition_cost[i], get_partition_probs(xd, i), vp9_partition_tree); } diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rd.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rd.h index 4d247342b0a..7ba2568fe68 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rd.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rd.h @@ -150,7 +150,7 @@ int16_t* vp9_raster_block_offset_int16(BLOCK_SIZE plane_bsize, YV12_BUFFER_CONFIG *vp9_get_scaled_ref_frame(const struct VP9_COMP *cpi, int ref_frame); -void vp9_init_me_luts(); +void vp9_init_me_luts(void); void vp9_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size, const struct macroblockd_plane *pd, diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rdopt.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rdopt.c index 73825623748..162d4de5f5d 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rdopt.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rdopt.c @@ -14,6 +14,7 @@ #include "./vp9_rtcd.h" #include "vpx_mem/vpx_mem.h" +#include "vpx_ports/mem.h" #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_entropy.h" @@ -24,6 +25,7 @@ #include "vp9/common/vp9_quant_common.h" #include "vp9/common/vp9_reconinter.h" #include "vp9/common/vp9_reconintra.h" +#include "vp9/common/vp9_scan.h" #include "vp9/common/vp9_seg_common.h" #include "vp9/common/vp9_systemdependent.h" @@ -1802,7 +1804,8 @@ static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x, frame_mv[ZEROMV][frame].as_int = 0; vp9_append_sub8x8_mvs_for_idx(cm, xd, tile, i, ref, mi_row, mi_col, &frame_mv[NEARESTMV][frame], - &frame_mv[NEARMV][frame]); + &frame_mv[NEARMV][frame], + xd->mi[0]->mbmi.mode_context); } // search for the best motion vector on this segment @@ -2117,8 +2120,8 @@ static void estimate_ref_frame_costs(const VP9_COMMON *cm, unsigned int *ref_costs_single, unsigned int *ref_costs_comp, vp9_prob *comp_mode_p) { - int seg_ref_active = vp9_segfeature_active(&cm->seg, segment_id, - SEG_LVL_REF_FRAME); + int seg_ref_active = segfeature_active(&cm->seg, segment_id, + SEG_LVL_REF_FRAME); if (seg_ref_active) { memset(ref_costs_single, 0, MAX_REF_FRAMES * sizeof(*ref_costs_single)); memset(ref_costs_comp, 0, MAX_REF_FRAMES * sizeof(*ref_costs_comp)); @@ -2218,7 +2221,7 @@ static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x, // Gets an initial list of candidate vectors from neighbours and orders them vp9_find_mv_refs(cm, xd, tile, mi, ref_frame, candidates, mi_row, mi_col, - NULL, NULL); + NULL, NULL, xd->mi[0]->mbmi.mode_context); // Candidate refinement carried out at encoder and decoder vp9_find_best_ref_mvs(xd, cm->allow_high_precision_mv, candidates, @@ -3004,8 +3007,8 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, } // If the segment reference frame feature is enabled.... // then do nothing if the current ref frame is not allowed.. 
- if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) && - vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) { + if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) && + get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) { ref_frame_skip_mask[0] |= (1 << ref_frame); ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK; } @@ -3014,7 +3017,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, // Disable this drop out case if the ref frame // segment level feature is enabled for this segment. This is to // prevent the possibility that we end up unable to pick any mode. - if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) { + if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) { // Only consider ZEROMV/ALTREF_FRAME for alt ref frame, // unless ARNR filtering is enabled in which case we want // an unfiltered alternative. We allow near/nearest as well @@ -3193,7 +3196,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, // Do not allow compound prediction if the segment level reference frame // feature is in use as in this case there can only be one reference. - if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) + if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) continue; if ((mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) && @@ -3635,7 +3638,7 @@ void vp9_rd_pick_inter_mode_sb_seg_skip(VP9_COMP *cpi, rd_cost->rate = INT_MAX; - assert(vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)); + assert(segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)); mbmi->mode = ZEROMV; mbmi->uv_mode = DC_PRED; @@ -3847,7 +3850,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, continue; // Do not allow compound prediction if the segment level reference frame // feature is in use as in this case there can only be one reference. - if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) + if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) continue; if ((sf->mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) && @@ -3872,13 +3875,13 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, // If the segment reference frame feature is enabled.... // then do nothing if the current ref frame is not allowed.. - if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) && - vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) { + if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) && + get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) { continue; // Disable this drop out case if the ref frame // segment level feature is enabled for this segment. This is to // prevent the possibility that we end up unable to pick any mode. - } else if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) { + } else if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) { // Only consider ZEROMV/ALTREF_FRAME for alt ref frame, // unless ARNR filtering is enabled in which case we want // an unfiltered alternative. 
We allow near/nearest as well diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_resize.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_resize.c index 2ebdff291d6..f46cad80491 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_resize.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_resize.c @@ -15,6 +15,7 @@ #include <stdlib.h> #include <string.h> +#include "vpx_ports/mem.h" #include "vp9/common/vp9_common.h" #include "vp9/encoder/vp9_resize.h" @@ -28,7 +29,7 @@ typedef int16_t interp_kernel[INTERP_TAPS]; // Filters for interpolation (0.5-band) - note this also filters integer pels. -const interp_kernel vp9_filteredinterp_filters500[(1 << SUBPEL_BITS)] = { +static const interp_kernel filteredinterp_filters500[(1 << SUBPEL_BITS)] = { {-3, 0, 35, 64, 35, 0, -3, 0}, {-3, -1, 34, 64, 36, 1, -3, 0}, {-3, -1, 32, 64, 38, 1, -3, 0}, @@ -64,7 +65,7 @@ const interp_kernel vp9_filteredinterp_filters500[(1 << SUBPEL_BITS)] = { }; // Filters for interpolation (0.625-band) - note this also filters integer pels. -const interp_kernel vp9_filteredinterp_filters625[(1 << SUBPEL_BITS)] = { +static const interp_kernel filteredinterp_filters625[(1 << SUBPEL_BITS)] = { {-1, -8, 33, 80, 33, -8, -1, 0}, {-1, -8, 30, 80, 35, -8, -1, 1}, {-1, -8, 28, 80, 37, -7, -2, 1}, @@ -100,7 +101,7 @@ const interp_kernel vp9_filteredinterp_filters625[(1 << SUBPEL_BITS)] = { }; // Filters for interpolation (0.75-band) - note this also filters integer pels. -const interp_kernel vp9_filteredinterp_filters750[(1 << SUBPEL_BITS)] = { +static const interp_kernel filteredinterp_filters750[(1 << SUBPEL_BITS)] = { {2, -11, 25, 96, 25, -11, 2, 0}, {2, -11, 22, 96, 28, -11, 2, 0}, {2, -10, 19, 95, 31, -11, 2, 0}, @@ -136,7 +137,7 @@ const interp_kernel vp9_filteredinterp_filters750[(1 << SUBPEL_BITS)] = { }; // Filters for interpolation (0.875-band) - note this also filters integer pels. 
-const interp_kernel vp9_filteredinterp_filters875[(1 << SUBPEL_BITS)] = { +static const interp_kernel filteredinterp_filters875[(1 << SUBPEL_BITS)] = { {3, -8, 13, 112, 13, -8, 3, 0}, {3, -7, 10, 112, 17, -9, 3, -1}, {2, -6, 7, 111, 21, -9, 3, -1}, @@ -172,7 +173,7 @@ const interp_kernel vp9_filteredinterp_filters875[(1 << SUBPEL_BITS)] = { }; // Filters for interpolation (full-band) - no filtering for integer pixels -const interp_kernel vp9_filteredinterp_filters1000[(1 << SUBPEL_BITS)] = { +static const interp_kernel filteredinterp_filters1000[(1 << SUBPEL_BITS)] = { {0, 0, 0, 128, 0, 0, 0, 0}, {0, 1, -3, 128, 3, -1, 0, 0}, {-1, 2, -6, 127, 7, -2, 1, 0}, @@ -214,15 +215,15 @@ static const int16_t vp9_down2_symodd_half_filter[] = {64, 35, 0, -3}; static const interp_kernel *choose_interp_filter(int inlength, int outlength) { int outlength16 = outlength * 16; if (outlength16 >= inlength * 16) - return vp9_filteredinterp_filters1000; + return filteredinterp_filters1000; else if (outlength16 >= inlength * 13) - return vp9_filteredinterp_filters875; + return filteredinterp_filters875; else if (outlength16 >= inlength * 11) - return vp9_filteredinterp_filters750; + return filteredinterp_filters750; else if (outlength16 >= inlength * 9) - return vp9_filteredinterp_filters625; + return filteredinterp_filters625; else - return vp9_filteredinterp_filters500; + return filteredinterp_filters500; } static void interpolate(const uint8_t *const input, int inlength, @@ -427,7 +428,7 @@ static int get_down2_length(int length, int steps) { return length; } -int get_down2_steps(int in_length, int out_length) { +static int get_down2_steps(int in_length, int out_length) { int steps = 0; int proj_in_length; while ((proj_in_length = get_down2_length(in_length, 1)) >= out_length) { diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.c index 4f93578326b..6c6c4ed3022 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.c @@ -19,9 +19,33 @@ static int frame_is_boosted(const VP9_COMP *cpi) { return frame_is_kf_gf_arf(cpi) || vp9_is_upper_layer_key_frame(cpi); } -static void set_good_speed_feature_framesize_dependent(VP9_COMMON *cm, +// Sets a partition size down to which the auto partition code will always +// search (can go lower), based on the image dimensions. The logic here +// is that the extent to which ringing artefacts are offensive, depends +// partly on the screen area that over which they propogate. Propogation is +// limited by transform block size but the screen area take up by a given block +// size will be larger for a small image format stretched to full screen. +static BLOCK_SIZE set_partition_min_limit(VP9_COMMON *const cm) { + unsigned int screen_area = (cm->width * cm->height); + + // Select block size based on image format size. + if (screen_area < 1280 * 720) { + // Formats smaller in area than 720P + return BLOCK_4X4; + } else if (screen_area < 1920 * 1080) { + // Format >= 720P and < 1080P + return BLOCK_8X8; + } else { + // Formats 1080P and up + return BLOCK_16X16; + } +} + +static void set_good_speed_feature_framesize_dependent(VP9_COMP *cpi, SPEED_FEATURES *sf, int speed) { + VP9_COMMON *const cm = &cpi->common; + if (speed >= 1) { if (MIN(cm->width, cm->height) >= 720) { sf->disable_split_mask = cm->show_frame ? 
DISABLE_ALL_SPLIT @@ -45,6 +69,7 @@ static void set_good_speed_feature_framesize_dependent(VP9_COMMON *cm, sf->partition_search_breakout_dist_thr = (1 << 22); sf->partition_search_breakout_rate_thr = 100; } + sf->rd_auto_partition_min_limit = set_partition_min_limit(cm); } if (speed >= 3) { @@ -62,6 +87,13 @@ static void set_good_speed_feature_framesize_dependent(VP9_COMMON *cm, } } + // If this is a two pass clip that fits the criteria for animated or + // graphics content then reset disable_split_mask for speeds 1-4. + if ((speed >= 1) && (cpi->oxcf.pass == 2) && + (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION)) { + sf->disable_split_mask = DISABLE_COMPOUND_SPLIT; + } + if (speed >= 4) { if (MIN(cm->width, cm->height) >= 720) { sf->partition_search_breakout_dist_thr = (1 << 26); @@ -72,29 +104,6 @@ static void set_good_speed_feature_framesize_dependent(VP9_COMMON *cm, } } -// Sets a partition size down to which the auto partition code will always -// search (can go lower), based on the image dimensions. The logic here -// is that the extent to which ringing artefacts are offensive, depends -// partly on the screen area that over which they propogate. Propogation is -// limited by transform block size but the screen area take up by a given block -// size will be larger for a small image format stretched to full screen. -static BLOCK_SIZE set_partition_min_limit(VP9_COMP *cpi) { - VP9_COMMON *const cm = &cpi->common; - unsigned int screen_area = (cm->width * cm->height); - - // Select block size based on image format size. - if (screen_area < 1280 * 720) { - // Formats smaller in area than 720P - return BLOCK_4X4; - } else if (screen_area < 1920 * 1080) { - // Format >= 720P and < 1080P - return BLOCK_8X8; - } else { - // Formats 1080P and up - return BLOCK_16X16; - } -} - static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm, SPEED_FEATURES *sf, int speed) { const int boosted = frame_is_boosted(cpi); @@ -139,7 +148,6 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm, sf->disable_filter_search_var_thresh = 100; sf->comp_inter_joint_search_thresh = BLOCK_SIZES; sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX; - sf->rd_auto_partition_min_limit = set_partition_min_limit(cpi); sf->allow_partition_search_skip = 1; } @@ -260,8 +268,12 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, FLAG_SKIP_INTRA_LOWVAR; sf->adaptive_pred_interp_filter = 2; - // Reference masking is not supported in dynamic scaling mode. - sf->reference_masking = cpi->oxcf.resize_mode != RESIZE_DYNAMIC ? 1 : 0; + // Disable reference masking if using spatial scaling since + // pred_mv_sad will not be set (since vp9_mv_pred will not + // be called). + // TODO(marpan/agrange): Fix this condition. + sf->reference_masking = (cpi->oxcf.resize_mode != RESIZE_DYNAMIC && + cpi->svc.number_spatial_layers == 1) ? 1 : 0; sf->disable_filter_search_var_thresh = 50; sf->comp_inter_joint_search_thresh = BLOCK_SIZES; @@ -337,6 +349,7 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, sf->coeff_prob_appx_step = 4; sf->use_fast_coef_updates = is_keyframe ? TWO_LOOP : ONE_LOOP_REDUCED; sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH; + sf->tx_size_search_method = is_keyframe ? USE_LARGESTALL : USE_TX_8X8; if (!is_keyframe) { int i; @@ -360,7 +373,6 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, // Turn on this to use non-RD key frame coding mode. 
sf->use_nonrd_pick_mode = 1; sf->mv.search_method = NSTEP; - sf->tx_size_search_method = is_keyframe ? USE_LARGESTALL : USE_TX_8X8; sf->mv.reduce_first_step_size = 1; sf->skip_encode_sb = 0; } @@ -379,7 +391,6 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, void vp9_set_speed_features_framesize_dependent(VP9_COMP *cpi) { SPEED_FEATURES *const sf = &cpi->sf; - VP9_COMMON *const cm = &cpi->common; const VP9EncoderConfig *const oxcf = &cpi->oxcf; RD_OPT *const rd = &cpi->rd; int i; @@ -387,7 +398,7 @@ void vp9_set_speed_features_framesize_dependent(VP9_COMP *cpi) { if (oxcf->mode == REALTIME) { set_rt_speed_feature_framesize_dependent(cpi, sf, oxcf->speed); } else if (oxcf->mode == GOOD) { - set_good_speed_feature_framesize_dependent(cm, sf, oxcf->speed); + set_good_speed_feature_framesize_dependent(cpi, sf, oxcf->speed); } if (sf->disable_split_mask == DISABLE_ALL_SPLIT) { diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ssim.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ssim.c index 88db5dda06d..172de5d1daa 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ssim.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ssim.c @@ -10,6 +10,7 @@ #include <math.h> #include "./vp9_rtcd.h" +#include "vpx_ports/mem.h" #include "vp9/encoder/vp9_ssim.h" void vp9_ssim_parms_16x16_c(uint8_t *s, int sp, uint8_t *r, diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_subexp.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_subexp.c index cfdc90d15fb..b345b162cdb 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_subexp.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_subexp.c @@ -12,6 +12,7 @@ #include "vp9/common/vp9_entropy.h" #include "vp9/encoder/vp9_cost.h" +#include "vp9/encoder/vp9_subexp.h" #include "vp9/encoder/vp9_writer.h" #define vp9_cost_upd256 ((int)(vp9_cost_one(upd) - vp9_cost_zero(upd))) diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_subexp.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_subexp.h index ac54893cf45..6fbb747e7d3 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_subexp.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_subexp.h @@ -16,11 +16,15 @@ extern "C" { #endif -void vp9_write_prob_diff_update(vp9_writer *w, +#include "vp9/common/vp9_prob.h" + +struct vp9_writer; + +void vp9_write_prob_diff_update(struct vp9_writer *w, vp9_prob newp, vp9_prob oldp); -void vp9_cond_prob_diff_update(vp9_writer *w, vp9_prob *oldp, - unsigned int *ct); +void vp9_cond_prob_diff_update(struct vp9_writer *w, vp9_prob *oldp, + const unsigned int ct[2]); int vp9_prob_diff_update_savings_search(const unsigned int *ct, vp9_prob oldp, vp9_prob *bestp, diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_svc_layercontext.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_svc_layercontext.c index b3491a27a4a..1b35ac9b61e 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_svc_layercontext.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_svc_layercontext.c @@ -15,89 +15,85 @@ #include "vp9/encoder/vp9_extend.h" #define SMALL_FRAME_FB_IDX 7 +#define SMALL_FRAME_WIDTH 16 +#define SMALL_FRAME_HEIGHT 16 void vp9_init_layer_context(VP9_COMP *const cpi) { SVC *const svc = &cpi->svc; const VP9EncoderConfig *const oxcf = &cpi->oxcf; - int layer; - int layer_end; + int sl, tl; int alt_ref_idx = 
svc->number_spatial_layers; svc->spatial_layer_id = 0; svc->temporal_layer_id = 0; - if (svc->number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) { - layer_end = svc->number_temporal_layers; - } else { - layer_end = svc->number_spatial_layers; - - if (cpi->oxcf.error_resilient_mode == 0 && cpi->oxcf.pass == 2) { - if (vp9_realloc_frame_buffer(&cpi->svc.empty_frame.img, - cpi->common.width, cpi->common.height, - cpi->common.subsampling_x, - cpi->common.subsampling_y, + if (cpi->oxcf.error_resilient_mode == 0 && cpi->oxcf.pass == 2) { + if (vp9_realloc_frame_buffer(&cpi->svc.empty_frame.img, + SMALL_FRAME_WIDTH, SMALL_FRAME_HEIGHT, + cpi->common.subsampling_x, + cpi->common.subsampling_y, #if CONFIG_VP9_HIGHBITDEPTH cpi->common.use_highbitdepth, #endif VP9_ENC_BORDER_IN_PIXELS, cpi->common.byte_alignment, NULL, NULL, NULL)) - vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, - "Failed to allocate empty frame for multiple frame " - "contexts"); - - memset(cpi->svc.empty_frame.img.buffer_alloc, 0x80, - cpi->svc.empty_frame.img.buffer_alloc_sz); - cpi->svc.empty_frame_width = cpi->common.width; - cpi->svc.empty_frame_height = cpi->common.height; - } + vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, + "Failed to allocate empty frame for multiple frame " + "contexts"); + + memset(cpi->svc.empty_frame.img.buffer_alloc, 0x80, + cpi->svc.empty_frame.img.buffer_alloc_sz); } - for (layer = 0; layer < layer_end; ++layer) { - LAYER_CONTEXT *const lc = &svc->layer_context[layer]; - RATE_CONTROL *const lrc = &lc->rc; - int i; - lc->current_video_frame_in_layer = 0; - lc->layer_size = 0; - lc->frames_from_key_frame = 0; - lc->last_frame_type = FRAME_TYPES; - lrc->ni_av_qi = oxcf->worst_allowed_q; - lrc->total_actual_bits = 0; - lrc->total_target_vs_actual = 0; - lrc->ni_tot_qi = 0; - lrc->tot_q = 0.0; - lrc->avg_q = 0.0; - lrc->ni_frames = 0; - lrc->decimation_count = 0; - lrc->decimation_factor = 0; - - for (i = 0; i < RATE_FACTOR_LEVELS; ++i) { - lrc->rate_correction_factors[i] = 1.0; - } + for (sl = 0; sl < oxcf->ss_number_layers; ++sl) { + for (tl = 0; tl < oxcf->ts_number_layers; ++tl) { + int layer = LAYER_IDS_TO_IDX(sl, tl, oxcf->ts_number_layers); + LAYER_CONTEXT *const lc = &svc->layer_context[layer]; + RATE_CONTROL *const lrc = &lc->rc; + int i; + lc->current_video_frame_in_layer = 0; + lc->layer_size = 0; + lc->frames_from_key_frame = 0; + lc->last_frame_type = FRAME_TYPES; + lrc->ni_av_qi = oxcf->worst_allowed_q; + lrc->total_actual_bits = 0; + lrc->total_target_vs_actual = 0; + lrc->ni_tot_qi = 0; + lrc->tot_q = 0.0; + lrc->avg_q = 0.0; + lrc->ni_frames = 0; + lrc->decimation_count = 0; + lrc->decimation_factor = 0; + + for (i = 0; i < RATE_FACTOR_LEVELS; ++i) { + lrc->rate_correction_factors[i] = 1.0; + } - if (svc->number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) { - lc->target_bandwidth = oxcf->ts_target_bitrate[layer]; - lrc->last_q[INTER_FRAME] = oxcf->worst_allowed_q; - lrc->avg_frame_qindex[INTER_FRAME] = oxcf->worst_allowed_q; - lrc->avg_frame_qindex[KEY_FRAME] = oxcf->worst_allowed_q; - } else { - lc->target_bandwidth = oxcf->ss_target_bitrate[layer]; - lrc->last_q[KEY_FRAME] = oxcf->best_allowed_q; - lrc->last_q[INTER_FRAME] = oxcf->best_allowed_q; - lrc->avg_frame_qindex[KEY_FRAME] = (oxcf->worst_allowed_q + - oxcf->best_allowed_q) / 2; - lrc->avg_frame_qindex[INTER_FRAME] = (oxcf->worst_allowed_q + + if (cpi->oxcf.rc_mode == VPX_CBR) { + lc->target_bandwidth = oxcf->layer_target_bitrate[layer]; + lrc->last_q[INTER_FRAME] = 
oxcf->worst_allowed_q; + lrc->avg_frame_qindex[INTER_FRAME] = oxcf->worst_allowed_q; + lrc->avg_frame_qindex[KEY_FRAME] = oxcf->worst_allowed_q; + } else { + lc->target_bandwidth = oxcf->layer_target_bitrate[layer]; + lrc->last_q[KEY_FRAME] = oxcf->best_allowed_q; + lrc->last_q[INTER_FRAME] = oxcf->best_allowed_q; + lrc->avg_frame_qindex[KEY_FRAME] = (oxcf->worst_allowed_q + oxcf->best_allowed_q) / 2; - if (oxcf->ss_enable_auto_arf[layer]) - lc->alt_ref_idx = alt_ref_idx++; - else - lc->alt_ref_idx = INVALID_IDX; - lc->gold_ref_idx = INVALID_IDX; - } + lrc->avg_frame_qindex[INTER_FRAME] = (oxcf->worst_allowed_q + + oxcf->best_allowed_q) / 2; + if (oxcf->ss_enable_auto_arf[sl]) + lc->alt_ref_idx = alt_ref_idx++; + else + lc->alt_ref_idx = INVALID_IDX; + lc->gold_ref_idx = INVALID_IDX; + } - lrc->buffer_level = oxcf->starting_buffer_level_ms * - lc->target_bandwidth / 1000; - lrc->bits_off_target = lrc->buffer_level; + lrc->buffer_level = oxcf->starting_buffer_level_ms * + lc->target_bandwidth / 1000; + lrc->bits_off_target = lrc->buffer_level; + } } // Still have extra buffer for base layer golden frame @@ -112,53 +108,98 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi, SVC *const svc = &cpi->svc; const VP9EncoderConfig *const oxcf = &cpi->oxcf; const RATE_CONTROL *const rc = &cpi->rc; - int layer; - int layer_end; + int sl, tl, layer = 0, spatial_layer_target; float bitrate_alloc = 1.0; - if (svc->number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) { - layer_end = svc->number_temporal_layers; - } else { - layer_end = svc->number_spatial_layers; - } + if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_NOLAYERING) { + for (sl = 0; sl < oxcf->ss_number_layers; ++sl) { + spatial_layer_target = 0; - for (layer = 0; layer < layer_end; ++layer) { - LAYER_CONTEXT *const lc = &svc->layer_context[layer]; - RATE_CONTROL *const lrc = &lc->rc; + for (tl = 0; tl < oxcf->ts_number_layers; ++tl) { + layer = LAYER_IDS_TO_IDX(sl, tl, oxcf->ts_number_layers); + svc->layer_context[layer].target_bandwidth = + oxcf->layer_target_bitrate[layer]; + } - if (svc->number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) { - lc->target_bandwidth = oxcf->ts_target_bitrate[layer]; - } else { - lc->target_bandwidth = oxcf->ss_target_bitrate[layer]; + layer = LAYER_IDS_TO_IDX(sl, ((oxcf->ts_number_layers - 1) < 0 ? 
+ 0 : (oxcf->ts_number_layers - 1)), oxcf->ts_number_layers); + spatial_layer_target = + svc->layer_context[layer].target_bandwidth = + oxcf->layer_target_bitrate[layer]; + + for (tl = 0; tl < oxcf->ts_number_layers; ++tl) { + LAYER_CONTEXT *const lc = + &svc->layer_context[sl * oxcf->ts_number_layers + tl]; + RATE_CONTROL *const lrc = &lc->rc; + + lc->spatial_layer_target_bandwidth = spatial_layer_target; + bitrate_alloc = (float)lc->target_bandwidth / spatial_layer_target; + lrc->starting_buffer_level = + (int64_t)(rc->starting_buffer_level * bitrate_alloc); + lrc->optimal_buffer_level = + (int64_t)(rc->optimal_buffer_level * bitrate_alloc); + lrc->maximum_buffer_size = + (int64_t)(rc->maximum_buffer_size * bitrate_alloc); + lrc->bits_off_target = + MIN(lrc->bits_off_target, lrc->maximum_buffer_size); + lrc->buffer_level = MIN(lrc->buffer_level, lrc->maximum_buffer_size); + lc->framerate = cpi->framerate / oxcf->ts_rate_decimator[tl]; + lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate); + lrc->max_frame_bandwidth = rc->max_frame_bandwidth; + lrc->worst_quality = rc->worst_quality; + lrc->best_quality = rc->best_quality; + } } - bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth; - // Update buffer-related quantities. - lrc->starting_buffer_level = - (int64_t)(rc->starting_buffer_level * bitrate_alloc); - lrc->optimal_buffer_level = - (int64_t)(rc->optimal_buffer_level * bitrate_alloc); - lrc->maximum_buffer_size = - (int64_t)(rc->maximum_buffer_size * bitrate_alloc); - lrc->bits_off_target = MIN(lrc->bits_off_target, lrc->maximum_buffer_size); - lrc->buffer_level = MIN(lrc->buffer_level, lrc->maximum_buffer_size); - // Update framerate-related quantities. + } else { + int layer_end; + if (svc->number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) { - lc->framerate = cpi->framerate / oxcf->ts_rate_decimator[layer]; + layer_end = svc->number_temporal_layers; } else { - lc->framerate = cpi->framerate; + layer_end = svc->number_spatial_layers; + } + + for (layer = 0; layer < layer_end; ++layer) { + LAYER_CONTEXT *const lc = &svc->layer_context[layer]; + RATE_CONTROL *const lrc = &lc->rc; + + lc->target_bandwidth = oxcf->layer_target_bitrate[layer]; + + bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth; + // Update buffer-related quantities. + lrc->starting_buffer_level = + (int64_t)(rc->starting_buffer_level * bitrate_alloc); + lrc->optimal_buffer_level = + (int64_t)(rc->optimal_buffer_level * bitrate_alloc); + lrc->maximum_buffer_size = + (int64_t)(rc->maximum_buffer_size * bitrate_alloc); + lrc->bits_off_target = MIN(lrc->bits_off_target, + lrc->maximum_buffer_size); + lrc->buffer_level = MIN(lrc->buffer_level, lrc->maximum_buffer_size); + // Update framerate-related quantities. + if (svc->number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) { + lc->framerate = cpi->framerate / oxcf->ts_rate_decimator[layer]; + } else { + lc->framerate = cpi->framerate; + } + lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate); + lrc->max_frame_bandwidth = rc->max_frame_bandwidth; + // Update qp-related quantities. + lrc->worst_quality = rc->worst_quality; + lrc->best_quality = rc->best_quality; } - lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate); - lrc->max_frame_bandwidth = rc->max_frame_bandwidth; - // Update qp-related quantities. 
- lrc->worst_quality = rc->worst_quality; - lrc->best_quality = rc->best_quality; } } static LAYER_CONTEXT *get_layer_context(VP9_COMP *const cpi) { - return (cpi->svc.number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) ? - &cpi->svc.layer_context[cpi->svc.temporal_layer_id] : - &cpi->svc.layer_context[cpi->svc.spatial_layer_id]; + if (is_one_pass_cbr_svc(cpi)) + return &cpi->svc.layer_context[cpi->svc.spatial_layer_id * + cpi->svc.number_temporal_layers + cpi->svc.temporal_layer_id]; + else + return (cpi->svc.number_temporal_layers > 1 && + cpi->oxcf.rc_mode == VPX_CBR) ? + &cpi->svc.layer_context[cpi->svc.temporal_layer_id] : + &cpi->svc.layer_context[cpi->svc.spatial_layer_id]; } void vp9_update_temporal_layer_framerate(VP9_COMP *const cpi) { @@ -166,18 +207,22 @@ void vp9_update_temporal_layer_framerate(VP9_COMP *const cpi) { const VP9EncoderConfig *const oxcf = &cpi->oxcf; LAYER_CONTEXT *const lc = get_layer_context(cpi); RATE_CONTROL *const lrc = &lc->rc; - const int layer = svc->temporal_layer_id; + // Index into spatial+temporal arrays. + const int st_idx = svc->spatial_layer_id * svc->number_temporal_layers + + svc->temporal_layer_id; + const int tl = svc->temporal_layer_id; - lc->framerate = cpi->framerate / oxcf->ts_rate_decimator[layer]; + lc->framerate = cpi->framerate / oxcf->ts_rate_decimator[tl]; lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate); lrc->max_frame_bandwidth = cpi->rc.max_frame_bandwidth; // Update the average layer frame size (non-cumulative per-frame-bw). - if (layer == 0) { + if (tl == 0) { lc->avg_frame_size = lrc->avg_frame_bandwidth; } else { const double prev_layer_framerate = - cpi->framerate / oxcf->ts_rate_decimator[layer - 1]; - const int prev_layer_target_bandwidth = oxcf->ts_target_bitrate[layer - 1]; + cpi->framerate / oxcf->ts_rate_decimator[tl - 1]; + const int prev_layer_target_bandwidth = + oxcf->layer_target_bitrate[st_idx - 1]; lc->avg_frame_size = (int)((lc->target_bandwidth - prev_layer_target_bandwidth) / (lc->framerate - prev_layer_framerate)); @@ -243,9 +288,8 @@ void vp9_init_second_pass_spatial_svc(VP9_COMP *cpi) { void vp9_inc_frame_in_layer(VP9_COMP *const cpi) { LAYER_CONTEXT *const lc = - (cpi->svc.number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) ? - &cpi->svc.layer_context[cpi->svc.temporal_layer_id] : - &cpi->svc.layer_context[cpi->svc.spatial_layer_id]; + &cpi->svc.layer_context[cpi->svc.spatial_layer_id * + cpi->svc.number_temporal_layers]; ++lc->current_video_frame_in_layer; ++lc->frames_from_key_frame; } @@ -253,10 +297,11 @@ void vp9_inc_frame_in_layer(VP9_COMP *const cpi) { int vp9_is_upper_layer_key_frame(const VP9_COMP *const cpi) { return is_two_pass_svc(cpi) && cpi->svc.spatial_layer_id > 0 && - cpi->svc.layer_context[cpi->svc.spatial_layer_id].is_key_frame; + cpi->svc.layer_context[cpi->svc.spatial_layer_id * + cpi->svc.number_temporal_layers + + cpi->svc.temporal_layer_id].is_key_frame; } -#if CONFIG_SPATIAL_SVC static void get_layer_resolution(const int width_org, const int height_org, const int num, const int den, int *width_out, int *height_out) { @@ -276,6 +321,201 @@ static void get_layer_resolution(const int width_org, const int height_org, *height_out = h; } +// The function sets proper ref_frame_flags, buffer indices, and buffer update +// variables for temporal layering mode 3 - that does 0-2-1-2 temporal layering +// scheme. 
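A compact sketch of the period-4 pattern described above, using the same derivation the function below performs (the helper name is illustrative, not part of the source):

/* Frame 0 -> TL0, frame 1 -> TL2, frame 2 -> TL1, frame 3 -> TL2, repeating. */
static int temporal_id_for_mode3(int frame_num_in_layer) {
  const int f = frame_num_in_layer % 4;
  return (f & 1) ? 2 : (f >> 1);
}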
+static void set_flags_and_fb_idx_for_temporal_mode3(VP9_COMP *const cpi) { + int frame_num_within_temporal_struct = 0; + int spatial_id, temporal_id; + spatial_id = cpi->svc.spatial_layer_id = cpi->svc.spatial_layer_to_encode; + frame_num_within_temporal_struct = + cpi->svc.layer_context[cpi->svc.spatial_layer_id * + cpi->svc.number_temporal_layers].current_video_frame_in_layer % 4; + temporal_id = cpi->svc.temporal_layer_id = + (frame_num_within_temporal_struct & 1) ? 2 : + (frame_num_within_temporal_struct >> 1); + cpi->ext_refresh_last_frame = cpi->ext_refresh_golden_frame = + cpi->ext_refresh_alt_ref_frame = 0; + if (!temporal_id) { + cpi->ext_refresh_frame_flags_pending = 1; + cpi->ext_refresh_last_frame = 1; + if (!spatial_id) { + cpi->ref_frame_flags = VP9_LAST_FLAG; + } else if (cpi->svc.layer_context[temporal_id].is_key_frame) { + // base layer is a key frame. + cpi->ref_frame_flags = VP9_GOLD_FLAG; + } else { + cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG; + } + } else if (temporal_id == 1) { + cpi->ext_refresh_frame_flags_pending = 1; + cpi->ext_refresh_alt_ref_frame = 1; + if (!spatial_id) { + cpi->ref_frame_flags = VP9_LAST_FLAG; + } else { + cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG; + } + } else { + if (frame_num_within_temporal_struct == 1) { + // the first tl2 picture + if (!spatial_id) { + cpi->ext_refresh_frame_flags_pending = 1; + cpi->ext_refresh_alt_ref_frame = 1; + cpi->ref_frame_flags = VP9_LAST_FLAG; + } else if (spatial_id < cpi->svc.number_spatial_layers - 1) { + cpi->ext_refresh_frame_flags_pending = 1; + cpi->ext_refresh_alt_ref_frame = 1; + cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG; + } else { // Top layer + cpi->ext_refresh_frame_flags_pending = 0; + cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG; + } + } else { + // The second tl2 picture + if (!spatial_id) { + cpi->ext_refresh_frame_flags_pending = 1; + cpi->ref_frame_flags = VP9_LAST_FLAG; + cpi->ext_refresh_last_frame = 1; + } else if (spatial_id < cpi->svc.number_spatial_layers - 1) { + cpi->ext_refresh_frame_flags_pending = 1; + cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG; + cpi->ext_refresh_last_frame = 1; + } else { // top layer + cpi->ext_refresh_frame_flags_pending = 0; + cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG; + } + } + } + if (temporal_id == 0) { + cpi->lst_fb_idx = spatial_id; + if (spatial_id) + cpi->gld_fb_idx = spatial_id - 1; + else + cpi->gld_fb_idx = 0; + cpi->alt_fb_idx = 0; + } else if (temporal_id == 1) { + cpi->lst_fb_idx = spatial_id; + cpi->gld_fb_idx = cpi->svc.number_spatial_layers + spatial_id - 1; + cpi->alt_fb_idx = cpi->svc.number_spatial_layers + spatial_id; + } else if (frame_num_within_temporal_struct == 1) { + cpi->lst_fb_idx = spatial_id; + cpi->gld_fb_idx = cpi->svc.number_spatial_layers + spatial_id - 1; + cpi->alt_fb_idx = cpi->svc.number_spatial_layers + spatial_id; + } else { + cpi->lst_fb_idx = cpi->svc.number_spatial_layers + spatial_id; + cpi->gld_fb_idx = cpi->svc.number_spatial_layers + spatial_id - 1; + cpi->alt_fb_idx = 0; + } +} + +// The function sets proper ref_frame_flags, buffer indices, and buffer update +// variables for temporal layering mode 2 - that does 0-1-0-1 temporal layering +// scheme. 
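Mode 2 simply alternates between the base and the enhancement temporal layer on every other frame; a one-line sketch mirroring the derivation in the function below (helper name again illustrative):

/* 0-1-0-1 pattern: even frames -> TL0, odd frames -> TL1. */
static int temporal_id_for_mode2(int frame_num_in_layer) {
  return frame_num_in_layer & 1;
}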
+static void set_flags_and_fb_idx_for_temporal_mode2(VP9_COMP *const cpi) { + int spatial_id, temporal_id; + spatial_id = cpi->svc.spatial_layer_id = cpi->svc.spatial_layer_to_encode; + temporal_id = cpi->svc.temporal_layer_id = + cpi->svc.layer_context[cpi->svc.spatial_layer_id * + cpi->svc.number_temporal_layers].current_video_frame_in_layer & 1; + cpi->ext_refresh_last_frame = cpi->ext_refresh_golden_frame = + cpi->ext_refresh_alt_ref_frame = 0; + if (!temporal_id) { + cpi->ext_refresh_frame_flags_pending = 1; + cpi->ext_refresh_last_frame = 1; + if (!spatial_id) { + cpi->ref_frame_flags = VP9_LAST_FLAG; + } else if (cpi->svc.layer_context[temporal_id].is_key_frame) { + // base layer is a key frame. + cpi->ref_frame_flags = VP9_GOLD_FLAG; + } else { + cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG; + } + } else if (temporal_id == 1) { + cpi->ext_refresh_frame_flags_pending = 1; + cpi->ext_refresh_alt_ref_frame = 1; + if (!spatial_id) { + cpi->ref_frame_flags = VP9_LAST_FLAG; + } else { + cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG; + } + } + + if (temporal_id == 0) { + cpi->lst_fb_idx = spatial_id; + if (spatial_id) + cpi->gld_fb_idx = spatial_id - 1; + else + cpi->gld_fb_idx = 0; + cpi->alt_fb_idx = 0; + } else if (temporal_id == 1) { + cpi->lst_fb_idx = spatial_id; + cpi->gld_fb_idx = cpi->svc.number_spatial_layers + spatial_id - 1; + cpi->alt_fb_idx = cpi->svc.number_spatial_layers + spatial_id; + } +} + +// The function sets proper ref_frame_flags, buffer indices, and buffer update +// variables for temporal layering mode 0 - that has no temporal layering. +static void set_flags_and_fb_idx_for_temporal_mode_noLayering( + VP9_COMP *const cpi) { + int spatial_id; + spatial_id = cpi->svc.spatial_layer_id = cpi->svc.spatial_layer_to_encode; + cpi->ext_refresh_last_frame = + cpi->ext_refresh_golden_frame = cpi->ext_refresh_alt_ref_frame = 0; + cpi->ext_refresh_frame_flags_pending = 1; + cpi->ext_refresh_last_frame = 1; + if (!spatial_id) { + cpi->ref_frame_flags = VP9_LAST_FLAG; + } else if (cpi->svc.layer_context[0].is_key_frame) { + cpi->ref_frame_flags = VP9_GOLD_FLAG; + } else { + cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG; + } + cpi->lst_fb_idx = spatial_id; + if (spatial_id) + cpi->gld_fb_idx = spatial_id - 1; + else + cpi->gld_fb_idx = 0; +} + +int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) { + int width = 0, height = 0; + LAYER_CONTEXT *lc = NULL; + + if (cpi->svc.temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_0212) { + set_flags_and_fb_idx_for_temporal_mode3(cpi); + } else if (cpi->svc.temporal_layering_mode == + VP9E_TEMPORAL_LAYERING_MODE_NOLAYERING) { + set_flags_and_fb_idx_for_temporal_mode_noLayering(cpi); + } else if (cpi->svc.temporal_layering_mode == + VP9E_TEMPORAL_LAYERING_MODE_0101) { + set_flags_and_fb_idx_for_temporal_mode2(cpi); + } else if (cpi->svc.temporal_layering_mode == + VP9E_TEMPORAL_LAYERING_MODE_BYPASS) { + // VP9E_TEMPORAL_LAYERING_MODE_BYPASS : + // if the code goes here, it means the encoder will be relying on the + // flags from outside for layering. + // However, since when spatial+temporal layering is used, the buffer indices + // cannot be derived automatically, the bypass mode will only work when the + // number of spatial layers equals 1. 
+ assert(cpi->svc.number_spatial_layers == 1); + } + + lc = &cpi->svc.layer_context[cpi->svc.spatial_layer_id * + cpi->svc.number_temporal_layers + + cpi->svc.temporal_layer_id]; + + get_layer_resolution(cpi->oxcf.width, cpi->oxcf.height, + lc->scaling_factor_num, lc->scaling_factor_den, + &width, &height); + + if (vp9_set_size_literal(cpi, width, height) != 0) + return VPX_CODEC_INVALID_PARAM; + + return 0; +} + +#if CONFIG_SPATIAL_SVC int vp9_svc_start_frame(VP9_COMP *const cpi) { int width = 0, height = 0; LAYER_CONTEXT *lc; @@ -362,20 +602,11 @@ int vp9_svc_start_frame(VP9_COMP *const cpi) { cpi->lst_fb_idx = cpi->gld_fb_idx = cpi->alt_fb_idx = SMALL_FRAME_FB_IDX; - // Gradually make the empty frame smaller to save bits. Make it half of - // its previous size because of the scaling factor restriction. - cpi->svc.empty_frame_width >>= 1; - cpi->svc.empty_frame_width = (cpi->svc.empty_frame_width + 1) & ~1; - if (cpi->svc.empty_frame_width < 16) - cpi->svc.empty_frame_width = 16; + if (cpi->svc.encode_intra_empty_frame != 0) + cpi->common.intra_only = 1; - cpi->svc.empty_frame_height >>= 1; - cpi->svc.empty_frame_height = (cpi->svc.empty_frame_height + 1) & ~1; - if (cpi->svc.empty_frame_height < 16) - cpi->svc.empty_frame_height = 16; - - width = cpi->svc.empty_frame_width; - height = cpi->svc.empty_frame_height; + width = SMALL_FRAME_WIDTH; + height = SMALL_FRAME_HEIGHT; } } } @@ -395,11 +626,12 @@ int vp9_svc_start_frame(VP9_COMP *const cpi) { return 0; } +#endif + struct lookahead_entry *vp9_svc_lookahead_pop(VP9_COMP *const cpi, struct lookahead_ctx *ctx, int drain) { struct lookahead_entry *buf = NULL; - if (ctx->sz && (drain || ctx->sz == ctx->max_sz - MAX_PRE_FRAMES)) { buf = vp9_lookahead_peek(ctx, 0); if (buf != NULL) { @@ -409,7 +641,5 @@ struct lookahead_entry *vp9_svc_lookahead_pop(VP9_COMP *const cpi, } } } - return buf; } -#endif diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_svc_layercontext.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_svc_layercontext.h index e9645ce9f24..b6a5ea54835 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_svc_layercontext.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_svc_layercontext.h @@ -22,6 +22,7 @@ extern "C" { typedef struct { RATE_CONTROL rc; int target_bandwidth; + int spatial_layer_target_bandwidth; // Target for the spatial layer. double framerate; int avg_frame_size; int max_q; @@ -57,17 +58,18 @@ typedef struct { NEED_TO_ENCODE }encode_empty_frame_state; struct lookahead_entry empty_frame; - int empty_frame_width; - int empty_frame_height; + int encode_intra_empty_frame; // Store scaled source frames to be used for temporal filter to generate // a alt ref frame. YV12_BUFFER_CONFIG scaled_frames[MAX_LAG_BUFFERS]; // Layer context used for rate control in one pass temporal CBR mode or - // two pass spatial mode. Defined for temporal or spatial layers for now. - // Does not support temporal combined with spatial RC. - LAYER_CONTEXT layer_context[MAX(VPX_TS_MAX_LAYERS, VPX_SS_MAX_LAYERS)]; + // two pass spatial mode. + LAYER_CONTEXT layer_context[VPX_MAX_LAYERS]; + // Indicates what sort of temporal layering is used. + // Currently, this only works for CBR mode. 
+ VP9E_TEMPORAL_LAYERING_MODE temporal_layering_mode; } SVC; struct VP9_COMP; @@ -111,6 +113,8 @@ struct lookahead_entry *vp9_svc_lookahead_pop(struct VP9_COMP *const cpi, // Start a frame and initialize svc parameters int vp9_svc_start_frame(struct VP9_COMP *const cpi); +int vp9_one_pass_cbr_svc_start_layer(struct VP9_COMP *const cpi); + #ifdef __cplusplus } // extern "C" #endif diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_temporal_filter.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_temporal_filter.c index d7979ab53a5..24b6203cb66 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_temporal_filter.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_temporal_filter.c @@ -23,7 +23,9 @@ #include "vp9/encoder/vp9_quantize.h" #include "vp9/encoder/vp9_ratectrl.h" #include "vp9/encoder/vp9_segmentation.h" +#include "vp9/encoder/vp9_temporal_filter.h" #include "vpx_mem/vpx_mem.h" +#include "vpx_ports/mem.h" #include "vpx_ports/vpx_timer.h" #include "vpx_scale/vpx_scale.h" @@ -109,7 +111,7 @@ static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd, kernel, mv_precision_uv, x, y); } -void vp9_temporal_filter_init() { +void vp9_temporal_filter_init(void) { int i; fixed_divide[0] = 0; @@ -680,7 +682,7 @@ void vp9_temporal_filter(VP9_COMP *cpi, int distance) { if (frames_to_blur > 0) { // Setup scaling factors. Scaling on each of the arnr frames is not // supported. - if (is_two_pass_svc(cpi)) { + if (cpi->use_svc) { // In spatial svc the scaling factors might be less then 1/2. // So we will use non-normative scaling. int frame_used = 0; diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_temporal_filter.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_temporal_filter.h index a971e0ae365..f537b8870a6 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_temporal_filter.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_temporal_filter.h @@ -15,7 +15,7 @@ extern "C" { #endif -void vp9_temporal_filter_init(); +void vp9_temporal_filter_init(void); void vp9_temporal_filter(VP9_COMP *cpi, int distance); #ifdef __cplusplus diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_tokenize.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_tokenize.c index 862be4d384a..181a99ce880 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_tokenize.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_tokenize.c @@ -17,6 +17,7 @@ #include "vp9/common/vp9_entropy.h" #include "vp9/common/vp9_pred_common.h" +#include "vp9/common/vp9_scan.h" #include "vp9/common/vp9_seg_common.h" #include "vp9/encoder/vp9_cost.h" @@ -483,7 +484,7 @@ static INLINE void add_token_no_extra(TOKENEXTRA **t, static INLINE int get_tx_eob(const struct segmentation *seg, int segment_id, TX_SIZE tx_size) { const int eob_max = 16 << (tx_size << 1); - return vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 0 : eob_max; + return segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 
0 : eob_max; } static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize, @@ -614,8 +615,8 @@ void vp9_tokenize_sb(VP9_COMP *cpi, ThreadData *td, TOKENEXTRA **t, MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; const int ctx = vp9_get_skip_context(xd); - const int skip_inc = !vp9_segfeature_active(&cm->seg, mbmi->segment_id, - SEG_LVL_SKIP); + const int skip_inc = !segfeature_active(&cm->seg, mbmi->segment_id, + SEG_LVL_SKIP); struct tokenize_b_args arg = {cpi, td, t}; if (mbmi->skip) { if (!dry_run) diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_variance.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_variance.c index f38f96d6c26..c571b7c9545 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_variance.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_variance.c @@ -9,6 +9,7 @@ */ #include "./vp9_rtcd.h" +#include "./vpx_dsp_rtcd.h" #include "vpx_ports/mem.h" #include "vpx/vpx_integer.h" @@ -18,25 +19,16 @@ #include "vp9/encoder/vp9_variance.h" -void variance(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - int w, int h, unsigned int *sse, int *sum) { - int i, j; - - *sum = 0; - *sse = 0; - - for (i = 0; i < h; i++) { - for (j = 0; j < w; j++) { - const int diff = a[j] - b[j]; - *sum += diff; - *sse += diff * diff; - } - - a += a_stride; - b += b_stride; - } -} +static uint8_t bilinear_filters[8][2] = { + { 128, 0, }, + { 112, 16, }, + { 96, 32, }, + { 80, 48, }, + { 64, 64, }, + { 48, 80, }, + { 32, 96, }, + { 16, 112, }, +}; // Applies a 1-D 2-tap bi-linear filter to the source block in either horizontal // or vertical direction to produce the filtered output block. Used to implement @@ -52,7 +44,7 @@ static void var_filter_block2d_bil_first_pass(const uint8_t *src_ptr, int pixel_step, unsigned int output_height, unsigned int output_width, - const int16_t *vp9_filter) { + const uint8_t *vp9_filter) { unsigned int i, j; for (i = 0; i < output_height; i++) { @@ -84,7 +76,7 @@ static void var_filter_block2d_bil_second_pass(const uint16_t *src_ptr, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, - const int16_t *vp9_filter) { + const uint8_t *vp9_filter) { unsigned int i, j; for (i = 0; i < output_height; i++) { @@ -100,25 +92,6 @@ static void var_filter_block2d_bil_second_pass(const uint16_t *src_ptr, } } -unsigned int vp9_get_mb_ss_c(const int16_t *src_ptr) { - unsigned int i, sum = 0; - - for (i = 0; i < 256; ++i) { - sum += src_ptr[i] * src_ptr[i]; - } - - return sum; -} - -#define VAR(W, H) \ -unsigned int vp9_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - unsigned int *sse) { \ - int sum; \ - variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ - return *sse - (((int64_t)sum * sum) / (W * H)); \ -} - #define SUBPIX_VAR(W, H) \ unsigned int vp9_sub_pixel_variance##W##x##H##_c( \ const uint8_t *src, int src_stride, \ @@ -129,11 +102,11 @@ unsigned int vp9_sub_pixel_variance##W##x##H##_c( \ uint8_t temp2[H * W]; \ \ var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, W, \ - BILINEAR_FILTERS_2TAP(xoffset)); \ + bilinear_filters[xoffset]); \ var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - BILINEAR_FILTERS_2TAP(yoffset)); \ + bilinear_filters[yoffset]); \ \ - return vp9_variance##W##x##H##_c(temp2, W, dst, dst_stride, sse); \ + return vpx_variance##W##x##H##_c(temp2, W, dst, dst_stride, sse); \ } #define SUBPIX_AVG_VAR(W, H) \ @@ -148,182 +121,55 
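The new bilinear_filters[] table above holds eight 2-tap pairs that each sum to 128, replacing the BILINEAR_FILTERS_2TAP() lookup on a 16-entry table. A standalone sketch of how one pair is applied per sample, assuming the usual rounded shift by 7 since the taps sum to 1 << 7 (this mirrors the shape of the filter-pass loops, not the library code itself):

    #include <stdint.h>

    #define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

    static const uint8_t bilinear_filters[8][2] = {
      { 128,   0 }, { 112,  16 }, { 96, 32 }, { 80, 48 },
      {  64,  64 }, {  48,  80 }, { 32, 96 }, { 16, 112 },
    };

    /* One output sample at sub-pel offset 'offset' (0..7); pixel_step is 1
     * for the horizontal pass and the row stride for the vertical pass. */
    static uint8_t bilinear_sample(const uint8_t *src, int pixel_step,
                                   int offset) {
      const uint8_t *f = bilinear_filters[offset];
      const int acc = (int)src[0] * f[0] + (int)src[pixel_step] * f[1];
      return (uint8_t)ROUND_POWER_OF_TWO(acc, 7);
    }

The smaller table also reflects that only eight sub-pel offsets are indexed by these paths, so the odd 1/16-pel entries are no longer needed.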
@@ unsigned int vp9_sub_pixel_avg_variance##W##x##H##_c( \ DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ \ var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, W, \ - BILINEAR_FILTERS_2TAP(xoffset)); \ + bilinear_filters[xoffset]); \ var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - BILINEAR_FILTERS_2TAP(yoffset)); \ + bilinear_filters[yoffset]); \ \ - vp9_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \ + vpx_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \ \ - return vp9_variance##W##x##H##_c(temp3, W, dst, dst_stride, sse); \ -} - -void vp9_get16x16var_c(const uint8_t *src_ptr, int source_stride, - const uint8_t *ref_ptr, int ref_stride, - unsigned int *sse, int *sum) { - variance(src_ptr, source_stride, ref_ptr, ref_stride, 16, 16, sse, sum); -} - -void vp9_get8x8var_c(const uint8_t *src_ptr, int source_stride, - const uint8_t *ref_ptr, int ref_stride, - unsigned int *sse, int *sum) { - variance(src_ptr, source_stride, ref_ptr, ref_stride, 8, 8, sse, sum); -} - -unsigned int vp9_mse16x16_c(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance(src, src_stride, ref, ref_stride, 16, 16, sse, &sum); - return *sse; -} - -unsigned int vp9_mse16x8_c(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance(src, src_stride, ref, ref_stride, 16, 8, sse, &sum); - return *sse; -} - -unsigned int vp9_mse8x16_c(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance(src, src_stride, ref, ref_stride, 8, 16, sse, &sum); - return *sse; -} - -unsigned int vp9_mse8x8_c(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance(src, src_stride, ref, ref_stride, 8, 8, sse, &sum); - return *sse; + return vpx_variance##W##x##H##_c(temp3, W, dst, dst_stride, sse); \ } -VAR(4, 4) SUBPIX_VAR(4, 4) SUBPIX_AVG_VAR(4, 4) -VAR(4, 8) SUBPIX_VAR(4, 8) SUBPIX_AVG_VAR(4, 8) -VAR(8, 4) SUBPIX_VAR(8, 4) SUBPIX_AVG_VAR(8, 4) -VAR(8, 8) SUBPIX_VAR(8, 8) SUBPIX_AVG_VAR(8, 8) -VAR(8, 16) SUBPIX_VAR(8, 16) SUBPIX_AVG_VAR(8, 16) -VAR(16, 8) SUBPIX_VAR(16, 8) SUBPIX_AVG_VAR(16, 8) -VAR(16, 16) SUBPIX_VAR(16, 16) SUBPIX_AVG_VAR(16, 16) -VAR(16, 32) SUBPIX_VAR(16, 32) SUBPIX_AVG_VAR(16, 32) -VAR(32, 16) SUBPIX_VAR(32, 16) SUBPIX_AVG_VAR(32, 16) -VAR(32, 32) SUBPIX_VAR(32, 32) SUBPIX_AVG_VAR(32, 32) -VAR(32, 64) SUBPIX_VAR(32, 64) SUBPIX_AVG_VAR(32, 64) -VAR(64, 32) SUBPIX_VAR(64, 32) SUBPIX_AVG_VAR(64, 32) -VAR(64, 64) SUBPIX_VAR(64, 64) SUBPIX_AVG_VAR(64, 64) -void vp9_comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int width, - int height, const uint8_t *ref, int ref_stride) { - int i, j; - - for (i = 0; i < height; i++) { - for (j = 0; j < width; j++) { - const int tmp = pred[j] + ref[j]; - comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1); - } - comp_pred += width; - pred += width; - ref += ref_stride; - } -} - #if CONFIG_VP9_HIGHBITDEPTH -void highbd_variance64(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, - int w, int h, uint64_t *sse, - uint64_t *sum) { - int i, j; - - uint16_t *a = CONVERT_TO_SHORTPTR(a8); - uint16_t *b = CONVERT_TO_SHORTPTR(b8); - *sum = 0; - *sse = 0; - - for (i = 0; i < h; i++) { - for (j = 0; j < w; j++) { - const int diff = a[j] - b[j]; - *sum += diff; - *sse += diff * diff; - } - a += a_stride; - b += b_stride; - } -} - -void highbd_variance(const uint8_t *a8, int a_stride, - const uint8_t *b8, 
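The VAR() wrappers and mse helpers removed in this stretch (now provided by vpx_dsp) all reduce to one identity: for an N-pixel block, the reported variance is the sum of squared differences minus the squared sum divided by N. A compact generic version of that computation, kept size-agnostic purely for illustration:

    #include <stdint.h>

    static unsigned int block_variance(const uint8_t *a, int a_stride,
                                       const uint8_t *b, int b_stride,
                                       int w, int h, unsigned int *sse) {
      int64_t sum = 0;
      int64_t sse64 = 0;
      int i, j;
      for (i = 0; i < h; ++i) {
        for (j = 0; j < w; ++j) {
          const int diff = a[j] - b[j];
          sum += diff;
          sse64 += (int64_t)diff * diff;
        }
        a += a_stride;
        b += b_stride;
      }
      *sse = (unsigned int)sse64;
      /* The 64-bit product keeps sum * sum exact even for 64x64 blocks. */
      return (unsigned int)(sse64 - (sum * sum) / (w * h));
    }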
int b_stride, - int w, int h, unsigned int *sse, - int *sum) { - uint64_t sse_long = 0; - uint64_t sum_long = 0; - highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long); - *sse = (unsigned int)sse_long; - *sum = (int)sum_long; -} - -void highbd_10_variance(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, - int w, int h, unsigned int *sse, - int *sum) { - uint64_t sse_long = 0; - uint64_t sum_long = 0; - highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long); - *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2); - *sse = (unsigned int)ROUND_POWER_OF_TWO(sse_long, 4); -} - -void highbd_12_variance(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, - int w, int h, unsigned int *sse, - int *sum) { - uint64_t sse_long = 0; - uint64_t sum_long = 0; - highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long); - *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); - *sse = (unsigned int)ROUND_POWER_OF_TWO(sse_long, 8); -} - static void highbd_var_filter_block2d_bil_first_pass( const uint8_t *src_ptr8, uint16_t *output_ptr, @@ -331,7 +177,7 @@ static void highbd_var_filter_block2d_bil_first_pass( int pixel_step, unsigned int output_height, unsigned int output_width, - const int16_t *vp9_filter) { + const uint8_t *vp9_filter) { unsigned int i, j; uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8); for (i = 0; i < output_height; i++) { @@ -357,7 +203,7 @@ static void highbd_var_filter_block2d_bil_second_pass( unsigned int pixel_step, unsigned int output_height, unsigned int output_width, - const int16_t *vp9_filter) { + const uint8_t *vp9_filter) { unsigned int i, j; for (i = 0; i < output_height; i++) { @@ -374,35 +220,6 @@ static void highbd_var_filter_block2d_bil_second_pass( } } -#define HIGHBD_VAR(W, H) \ -unsigned int vp9_highbd_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - unsigned int *sse) { \ - int sum; \ - highbd_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ - return *sse - (((int64_t)sum * sum) / (W * H)); \ -} \ -\ -unsigned int vp9_highbd_10_variance##W##x##H##_c(const uint8_t *a, \ - int a_stride, \ - const uint8_t *b, \ - int b_stride, \ - unsigned int *sse) { \ - int sum; \ - highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ - return *sse - (((int64_t)sum * sum) / (W * H)); \ -} \ -\ -unsigned int vp9_highbd_12_variance##W##x##H##_c(const uint8_t *a, \ - int a_stride, \ - const uint8_t *b, \ - int b_stride, \ - unsigned int *sse) { \ - int sum; \ - highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ - return *sse - (((int64_t)sum * sum) / (W * H)); \ -} - #define HIGHBD_SUBPIX_VAR(W, H) \ unsigned int vp9_highbd_sub_pixel_variance##W##x##H##_c( \ const uint8_t *src, int src_stride, \ @@ -413,11 +230,11 @@ unsigned int vp9_highbd_sub_pixel_variance##W##x##H##_c( \ uint16_t temp2[H * W]; \ \ highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \ - W, BILINEAR_FILTERS_2TAP(xoffset)); \ + W, bilinear_filters[xoffset]); \ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - BILINEAR_FILTERS_2TAP(yoffset)); \ + bilinear_filters[yoffset]); \ \ - return vp9_highbd_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, dst, \ + return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, dst, \ dst_stride, sse); \ } \ \ @@ -430,11 +247,11 @@ unsigned int vp9_highbd_10_sub_pixel_variance##W##x##H##_c( \ uint16_t temp2[H * W]; \ \ highbd_var_filter_block2d_bil_first_pass(src, fdata3, 
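The removed highbd_10_variance()/highbd_12_variance() wrappers above scale the accumulated totals back to 8-bit-equivalent units so the same thresholds can be reused across bit depths. A hedged sketch of that normalization (function name and the bd parameter are invented for the example):

    #include <stdint.h>

    #define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

    static void normalize_to_8bit(int bd, uint64_t sse_long, uint64_t sum_long,
                                  unsigned int *sse, int *sum) {
      if (bd == 8) {                 /* 8-bit input needs no rescaling */
        *sse = (unsigned int)sse_long;
        *sum = (int)sum_long;
        return;
      }
      /* 10-bit differences span up to 4x the 8-bit range, 12-bit up to 16x,
       * so the sum shrinks by (bd - 8) bits and the sum of squares by twice
       * that: >>2 / >>4 for 10-bit, >>4 / >>8 for 12-bit. */
      *sum = (int)ROUND_POWER_OF_TWO(sum_long, bd - 8);
      *sse = (unsigned int)ROUND_POWER_OF_TWO(sse_long, 2 * (bd - 8));
    }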
src_stride, 1, H + 1, \ - W, BILINEAR_FILTERS_2TAP(xoffset)); \ + W, bilinear_filters[xoffset]); \ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - BILINEAR_FILTERS_2TAP(yoffset)); \ + bilinear_filters[yoffset]); \ \ - return vp9_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \ + return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \ W, dst, dst_stride, sse); \ } \ \ @@ -447,11 +264,11 @@ unsigned int vp9_highbd_12_sub_pixel_variance##W##x##H##_c( \ uint16_t temp2[H * W]; \ \ highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \ - W, BILINEAR_FILTERS_2TAP(xoffset)); \ + W, bilinear_filters[xoffset]); \ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - BILINEAR_FILTERS_2TAP(yoffset)); \ + bilinear_filters[yoffset]); \ \ - return vp9_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \ + return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \ W, dst, dst_stride, sse); \ } @@ -467,14 +284,14 @@ unsigned int vp9_highbd_sub_pixel_avg_variance##W##x##H##_c( \ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ \ highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \ - W, BILINEAR_FILTERS_2TAP(xoffset)); \ + W, bilinear_filters[xoffset]); \ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - BILINEAR_FILTERS_2TAP(yoffset)); \ + bilinear_filters[yoffset]); \ \ - vp9_highbd_comp_avg_pred(temp3, second_pred, W, H, \ + vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \ CONVERT_TO_BYTEPTR(temp2), W); \ \ - return vp9_highbd_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, dst, \ + return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, dst, \ dst_stride, sse); \ } \ \ @@ -489,14 +306,14 @@ unsigned int vp9_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ \ highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \ - W, BILINEAR_FILTERS_2TAP(xoffset)); \ + W, bilinear_filters[xoffset]); \ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - BILINEAR_FILTERS_2TAP(yoffset)); \ + bilinear_filters[yoffset]); \ \ - vp9_highbd_comp_avg_pred(temp3, second_pred, W, H, \ + vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \ CONVERT_TO_BYTEPTR(temp2), W); \ \ - return vp9_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \ + return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \ W, dst, dst_stride, sse); \ } \ \ @@ -511,141 +328,53 @@ unsigned int vp9_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ \ highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \ - W, BILINEAR_FILTERS_2TAP(xoffset)); \ + W, bilinear_filters[xoffset]); \ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - BILINEAR_FILTERS_2TAP(yoffset)); \ + bilinear_filters[yoffset]); \ \ - vp9_highbd_comp_avg_pred(temp3, second_pred, W, H, \ + vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \ CONVERT_TO_BYTEPTR(temp2), W); \ \ - return vp9_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \ + return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \ W, dst, dst_stride, sse); \ } -#define HIGHBD_GET_VAR(S) \ -void vp9_highbd_get##S##x##S##var_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride, \ - unsigned int *sse, int *sum) { \ - highbd_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \ -} \ -\ -void 
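The ..._sub_pixel_avg_variance macros above average a second predictor with the filtered block before measuring variance, via vpx_comp_avg_pred()/vpx_highbd_comp_avg_pred(). A minimal 8-bit sketch of that rounded average (illustrative only; the real helpers live in vpx_dsp):

    #include <stdint.h>

    static void comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred,
                              int width, int height,
                              const uint8_t *ref, int ref_stride) {
      int i, j;
      for (i = 0; i < height; ++i) {
        for (j = 0; j < width; ++j)
          comp_pred[j] = (uint8_t)((pred[j] + ref[j] + 1) >> 1);  /* round half up */
        comp_pred += width;   /* pred and comp_pred are packed W-wide blocks */
        pred += width;
        ref += ref_stride;    /* only ref keeps its own stride */
      }
    }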
vp9_highbd_10_get##S##x##S##var_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride, \ - unsigned int *sse, int *sum) { \ - highbd_10_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \ -} \ -\ -void vp9_highbd_12_get##S##x##S##var_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride, \ - unsigned int *sse, int *sum) { \ - highbd_12_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \ -} - -#define HIGHBD_MSE(W, H) \ -unsigned int vp9_highbd_mse##W##x##H##_c(const uint8_t *src, \ - int src_stride, \ - const uint8_t *ref, \ - int ref_stride, \ - unsigned int *sse) { \ - int sum; \ - highbd_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \ - return *sse; \ -} \ -\ -unsigned int vp9_highbd_10_mse##W##x##H##_c(const uint8_t *src, \ - int src_stride, \ - const uint8_t *ref, \ - int ref_stride, \ - unsigned int *sse) { \ - int sum; \ - highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \ - return *sse; \ -} \ -\ -unsigned int vp9_highbd_12_mse##W##x##H##_c(const uint8_t *src, \ - int src_stride, \ - const uint8_t *ref, \ - int ref_stride, \ - unsigned int *sse) { \ - int sum; \ - highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \ - return *sse; \ -} - -HIGHBD_GET_VAR(8) -HIGHBD_GET_VAR(16) - -HIGHBD_MSE(16, 16) -HIGHBD_MSE(16, 8) -HIGHBD_MSE(8, 16) -HIGHBD_MSE(8, 8) - -HIGHBD_VAR(4, 4) HIGHBD_SUBPIX_VAR(4, 4) HIGHBD_SUBPIX_AVG_VAR(4, 4) -HIGHBD_VAR(4, 8) HIGHBD_SUBPIX_VAR(4, 8) HIGHBD_SUBPIX_AVG_VAR(4, 8) -HIGHBD_VAR(8, 4) HIGHBD_SUBPIX_VAR(8, 4) HIGHBD_SUBPIX_AVG_VAR(8, 4) -HIGHBD_VAR(8, 8) HIGHBD_SUBPIX_VAR(8, 8) HIGHBD_SUBPIX_AVG_VAR(8, 8) -HIGHBD_VAR(8, 16) HIGHBD_SUBPIX_VAR(8, 16) HIGHBD_SUBPIX_AVG_VAR(8, 16) -HIGHBD_VAR(16, 8) HIGHBD_SUBPIX_VAR(16, 8) HIGHBD_SUBPIX_AVG_VAR(16, 8) -HIGHBD_VAR(16, 16) HIGHBD_SUBPIX_VAR(16, 16) HIGHBD_SUBPIX_AVG_VAR(16, 16) -HIGHBD_VAR(16, 32) HIGHBD_SUBPIX_VAR(16, 32) HIGHBD_SUBPIX_AVG_VAR(16, 32) -HIGHBD_VAR(32, 16) HIGHBD_SUBPIX_VAR(32, 16) HIGHBD_SUBPIX_AVG_VAR(32, 16) -HIGHBD_VAR(32, 32) HIGHBD_SUBPIX_VAR(32, 32) HIGHBD_SUBPIX_AVG_VAR(32, 32) -HIGHBD_VAR(32, 64) HIGHBD_SUBPIX_VAR(32, 64) HIGHBD_SUBPIX_AVG_VAR(32, 64) -HIGHBD_VAR(64, 32) HIGHBD_SUBPIX_VAR(64, 32) HIGHBD_SUBPIX_AVG_VAR(64, 32) -HIGHBD_VAR(64, 64) HIGHBD_SUBPIX_VAR(64, 64) HIGHBD_SUBPIX_AVG_VAR(64, 64) - -void vp9_highbd_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred8, - int width, int height, const uint8_t *ref8, - int ref_stride) { - int i, j; - uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); - for (i = 0; i < height; i++) { - for (j = 0; j < width; j++) { - const int tmp = pred[j] + ref[j]; - comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1); - } - comp_pred += width; - pred += width; - ref += ref_stride; - } -} #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_variance.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_variance.h index 53148f23c56..0a8739510f4 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_variance.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_variance.h @@ -12,33 +12,12 @@ #define VP9_ENCODER_VP9_VARIANCE_H_ #include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" #ifdef __cplusplus extern "C" { #endif -void variance(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - int w, int h, - unsigned int *sse, int *sum); - -#if CONFIG_VP9_HIGHBITDEPTH -void highbd_variance(const uint8_t *a8, 
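The HIGHBD_MSE() wrappers deleted above make the relationship between the two metrics explicit: the mse functions return *sse unchanged, while the variance functions subtract the squared-mean term. Both come from the same (sse, sum) accumulation pass; a small illustrative helper:

    #include <stdint.h>

    static void sse_to_metrics(unsigned int sse, int sum, int n_pixels,
                               unsigned int *mse, unsigned int *var) {
      *mse = sse;                                                    /* raw SSE */
      *var = sse - (unsigned int)(((int64_t)sum * sum) / n_pixels);  /* variance */
    }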
int a_stride, - const uint8_t *b8, int b_stride, - int w, int h, - unsigned int *sse, int *sum); - -void highbd_10_variance(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, - int w, int h, - unsigned int *sse, int *sum); - -void highbd_12_variance(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, - int w, int h, - unsigned int *sse, int *sum); -#endif - typedef unsigned int(*vp9_sad_fn_t)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, @@ -95,15 +74,6 @@ typedef struct vp9_variance_vtable { vp9_sad_multi_d_fn_t sdx4df; } vp9_variance_fn_ptr_t; -void vp9_comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int width, - int height, const uint8_t *ref, int ref_stride); - -#if CONFIG_VP9_HIGHBITDEPTH -void vp9_highbd_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred, - int width, int height, - const uint8_t *ref, int ref_stride); -#endif - #ifdef __cplusplus } // extern "C" #endif diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_writer.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_writer.h index 9d161f95cf6..e347ea41441 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_writer.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_writer.h @@ -19,7 +19,7 @@ extern "C" { #endif -typedef struct { +typedef struct vp9_writer { unsigned int lowvalue; unsigned int range; int count; diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_avg_intrin_sse2.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_avg_intrin_sse2.c index 4672aa6b8cf..4531d794a9c 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_avg_intrin_sse2.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_avg_intrin_sse2.c @@ -9,6 +9,8 @@ */ #include <emmintrin.h> + +#include "./vp9_rtcd.h" #include "vpx_ports/mem.h" void vp9_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp, @@ -262,17 +264,18 @@ void vp9_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride, __m128i b2 = _mm_add_epi16(coeff2, coeff3); __m128i b3 = _mm_sub_epi16(coeff2, coeff3); + b0 = _mm_srai_epi16(b0, 1); + b1 = _mm_srai_epi16(b1, 1); + b2 = _mm_srai_epi16(b2, 1); + b3 = _mm_srai_epi16(b3, 1); + coeff0 = _mm_add_epi16(b0, b2); coeff1 = _mm_add_epi16(b1, b3); - coeff0 = _mm_srai_epi16(coeff0, 1); - coeff1 = _mm_srai_epi16(coeff1, 1); _mm_store_si128((__m128i *)coeff, coeff0); _mm_store_si128((__m128i *)(coeff + 64), coeff1); coeff2 = _mm_sub_epi16(b0, b2); coeff3 = _mm_sub_epi16(b1, b3); - coeff2 = _mm_srai_epi16(coeff2, 1); - coeff3 = _mm_srai_epi16(coeff3, 1); _mm_store_si128((__m128i *)(coeff + 128), coeff2); _mm_store_si128((__m128i *)(coeff + 192), coeff3); diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct32x32_avx2.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct32x32_avx2_impl.h index 66827ad8037..ae6bfe5fa2d 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct32x32_avx2.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct32x32_avx2_impl.h @@ -9,6 +9,8 @@ */ #include <immintrin.h> // AVX2 + +#include "./vp9_rtcd.h" #include "vp9/common/vp9_idct.h" // for cospi constants #include "vpx_ports/mem.h" diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct32x32_sse2.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct32x32_sse2_impl.h index 099993aa6a0..003ebd13fe3 100644 --- 
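The vp9_hadamard_16x16_sse2 hunk above moves the >> 1 from the combined outputs onto b0..b3 before the final butterfly. Since _mm_add_epi16/_mm_sub_epi16 wrap on overflow, halving first keeps the second-level sums inside the signed 16-bit lanes, which is the likely motivation for the reordering. The scalar shape of the new ordering, as a sketch rather than the intrinsics code:

    #include <stdint.h>

    static void combine_halved(int16_t c0, int16_t c1, int16_t c2, int16_t c3,
                               int16_t out[4]) {
      /* Halve the butterfly terms first so the following adds/subs stay
       * within the 16-bit range of the SIMD lanes they model. */
      const int16_t b0 = (int16_t)((c0 + c1) >> 1);
      const int16_t b1 = (int16_t)((c0 - c1) >> 1);
      const int16_t b2 = (int16_t)((c2 + c3) >> 1);
      const int16_t b3 = (int16_t)((c2 - c3) >> 1);
      out[0] = (int16_t)(b0 + b2);
      out[1] = (int16_t)(b1 + b3);
      out[2] = (int16_t)(b0 - b2);
      out[3] = (int16_t)(b1 - b3);
    }

Shifting before combining trades a small amount of rounding precision for the overflow headroom; the overall scale of the outputs is unchanged.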
a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct32x32_sse2.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct32x32_sse2_impl.h @@ -9,6 +9,8 @@ */ #include <emmintrin.h> // SSE2 + +#include "./vp9_rtcd.h" #include "vp9/common/vp9_idct.h" // for cospi constants #include "vp9/encoder/x86/vp9_dct_sse2.h" #include "vp9/encoder/vp9_dct.h" diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_avx2.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_avx2.c index 3a19f52746c..8f3b61ad86d 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_avx2.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_avx2.c @@ -15,12 +15,12 @@ #define FDCT32x32_2D_AVX2 vp9_fdct32x32_rd_avx2 #define FDCT32x32_HIGH_PRECISION 0 -#include "vp9/encoder/x86/vp9_dct32x32_avx2.c" +#include "vp9/encoder/x86/vp9_dct32x32_avx2_impl.h" #undef FDCT32x32_2D_AVX2 #undef FDCT32x32_HIGH_PRECISION #define FDCT32x32_2D_AVX2 vp9_fdct32x32_avx2 #define FDCT32x32_HIGH_PRECISION 1 -#include "vp9/encoder/x86/vp9_dct32x32_avx2.c" // NOLINT +#include "vp9/encoder/x86/vp9_dct32x32_avx2_impl.h" // NOLINT #undef FDCT32x32_2D_AVX2 #undef FDCT32x32_HIGH_PRECISION diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c index 564b7955e5b..cff4fcbdce0 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c @@ -10,6 +10,8 @@ #include <assert.h> #include <emmintrin.h> // SSE2 + +#include "./vp9_rtcd.h" #include "vp9/common/vp9_idct.h" // for cospi constants #include "vp9/encoder/vp9_dct.h" #include "vp9/encoder/x86/vp9_dct_sse2.h" @@ -96,7 +98,7 @@ static INLINE void transpose_4x4(__m128i *res) { res[3] = _mm_unpackhi_epi64(res[2], res[2]); } -void fdct4_sse2(__m128i *in) { +static void fdct4_sse2(__m128i *in) { const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); @@ -129,7 +131,7 @@ void fdct4_sse2(__m128i *in) { transpose_4x4(in); } -void fadst4_sse2(__m128i *in) { +static void fadst4_sse2(__m128i *in) { const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9); const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9); const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9); @@ -831,7 +833,7 @@ static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) { // 07 17 27 37 47 57 67 77 } -void fdct8_sse2(__m128i *in) { +static void fdct8_sse2(__m128i *in) { // constants const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); @@ -971,7 +973,7 @@ void fdct8_sse2(__m128i *in) { array_transpose_8x8(in, in); } -void fadst8_sse2(__m128i *in) { +static void fadst8_sse2(__m128i *in) { // Constants const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); @@ -1353,7 +1355,7 @@ static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) { right_shift_8x8(res1 + 8, 2); } -void fdct16_8col(__m128i *in) { +static void fdct16_8col(__m128i *in) { // perform 16x16 1-D DCT for 8 columns __m128i i[8], s[8], p[8], t[8], u[16], v[16]; const 
__m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); @@ -1675,7 +1677,7 @@ void fdct16_8col(__m128i *in) { in[15] = _mm_packs_epi32(v[14], v[15]); } -void fadst16_8col(__m128i *in) { +static void fadst16_8col(__m128i *in) { // perform 16x16 1-D ADST for 8 columns __m128i s[16], x[16], u[32], v[32]; const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64); @@ -2145,13 +2147,13 @@ void fadst16_8col(__m128i *in) { in[15] = _mm_sub_epi16(kZero, s[1]); } -void fdct16_sse2(__m128i *in0, __m128i *in1) { +static void fdct16_sse2(__m128i *in0, __m128i *in1) { fdct16_8col(in0); fdct16_8col(in1); array_transpose_16x16(in0, in1); } -void fadst16_sse2(__m128i *in0, __m128i *in1) { +static void fadst16_sse2(__m128i *in0, __m128i *in1) { fadst16_8col(in0); fadst16_8col(in1); array_transpose_16x16(in0, in1); @@ -2334,7 +2336,7 @@ void vp9_highbd_fht8x8_sse2(const int16_t *input, tran_low_t *output, } } -void vp9_highbd_fht16x16_sse2(int16_t *input, tran_low_t *output, +void vp9_highbd_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride, int tx_type) { if (tx_type == DCT_DCT) { vp9_highbd_fdct16x16_sse2(input, output, stride); @@ -2368,8 +2370,8 @@ void vp9_highbd_fht16x16_sse2(int16_t *input, tran_low_t *output, /* * The DCTnxn functions are defined using the macros below. The main code for - * them is in separate files (vp9/encoder/x86/vp9_dct_impl_sse2.c & - * vp9/encoder/x86/vp9_dct32x32_sse2.c) which are used by both the 8 bit code + * them is in separate files (vp9/encoder/x86/vp9_dct_sse2_impl.h & + * vp9/encoder/x86/vp9_dct32x32_sse2_impl.h) which are used by both the 8 bit code * and the high bit depth code. */ @@ -2378,20 +2380,20 @@ void vp9_highbd_fht16x16_sse2(int16_t *input, tran_low_t *output, #define FDCT4x4_2D vp9_fdct4x4_sse2 #define FDCT8x8_2D vp9_fdct8x8_sse2 #define FDCT16x16_2D vp9_fdct16x16_sse2 -#include "vp9/encoder/x86/vp9_dct_impl_sse2.c" +#include "vp9/encoder/x86/vp9_dct_sse2_impl.h" #undef FDCT4x4_2D #undef FDCT8x8_2D #undef FDCT16x16_2D #define FDCT32x32_2D vp9_fdct32x32_rd_sse2 #define FDCT32x32_HIGH_PRECISION 0 -#include "vp9/encoder/x86/vp9_dct32x32_sse2.c" +#include "vp9/encoder/x86/vp9_dct32x32_sse2_impl.h" #undef FDCT32x32_2D #undef FDCT32x32_HIGH_PRECISION #define FDCT32x32_2D vp9_fdct32x32_sse2 #define FDCT32x32_HIGH_PRECISION 1 -#include "vp9/encoder/x86/vp9_dct32x32_sse2.c" // NOLINT +#include "vp9/encoder/x86/vp9_dct32x32_sse2_impl.h" // NOLINT #undef FDCT32x32_2D #undef FDCT32x32_HIGH_PRECISION @@ -2405,20 +2407,20 @@ void vp9_highbd_fht16x16_sse2(int16_t *input, tran_low_t *output, #define FDCT4x4_2D vp9_highbd_fdct4x4_sse2 #define FDCT8x8_2D vp9_highbd_fdct8x8_sse2 #define FDCT16x16_2D vp9_highbd_fdct16x16_sse2 -#include "vp9/encoder/x86/vp9_dct_impl_sse2.c" // NOLINT +#include "vp9/encoder/x86/vp9_dct_sse2_impl.h" // NOLINT #undef FDCT4x4_2D #undef FDCT8x8_2D #undef FDCT16x16_2D #define FDCT32x32_2D vp9_highbd_fdct32x32_rd_sse2 #define FDCT32x32_HIGH_PRECISION 0 -#include "vp9/encoder/x86/vp9_dct32x32_sse2.c" // NOLINT +#include "vp9/encoder/x86/vp9_dct32x32_sse2_impl.h" // NOLINT #undef FDCT32x32_2D #undef FDCT32x32_HIGH_PRECISION #define FDCT32x32_2D vp9_highbd_fdct32x32_sse2 #define FDCT32x32_HIGH_PRECISION 1 -#include "vp9/encoder/x86/vp9_dct32x32_sse2.c" // NOLINT +#include "vp9/encoder/x86/vp9_dct32x32_sse2_impl.h" // NOLINT #undef FDCT32x32_2D #undef FDCT32x32_HIGH_PRECISION diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_impl_sse2.c 
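The renames above (vp9_dct32x32_sse2.c to vp9_dct32x32_sse2_impl.h, and similarly for the AVX2 and generic DCT bodies) formalize an include-as-template idiom: one shared body is compiled several times under different function names and precision settings, selected by macros defined immediately before each #include. A stripped-down illustration of the pattern; the file and macro names here are invented, not the libvpx ones:

    /* fdct_impl.inc -- shared body, included more than once. */
    void FDCT_NAME(const int16_t *in, int16_t *out, int stride) {
    #if HIGH_PRECISION
      /* full-precision rounding path */
    #else
      /* reduced-precision ("rd") path */
    #endif
    }

    /* fdct.c -- instantiate two variants from the one body. */
    #include <stdint.h>

    #define FDCT_NAME       fdct32x32_rd
    #define HIGH_PRECISION  0
    #include "fdct_impl.inc"
    #undef FDCT_NAME
    #undef HIGH_PRECISION

    #define FDCT_NAME       fdct32x32
    #define HIGH_PRECISION  1
    #include "fdct_impl.inc"
    #undef FDCT_NAME
    #undef HIGH_PRECISION

Giving the shared body a header-style extension presumably keeps the build systems from treating it as an independent translation unit, which is what the accompanying BUILD.gn/libvpx_srcs.gni updates elsewhere in this patch reflect.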
b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_sse2_impl.h index e03a76d2e89..11bf5a25e62 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_impl_sse2.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_sse2_impl.h @@ -9,6 +9,8 @@ */ #include <emmintrin.h> // SSE2 + +#include "./vp9_rtcd.h" #include "vp9/common/vp9_idct.h" // for cospi constants #include "vp9/encoder/vp9_dct.h" #include "vp9/encoder/x86/vp9_dct_sse2.h" diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c index 1c1005aeeda..96038fee16b 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c @@ -15,6 +15,8 @@ #include <math.h> #endif #include <tmmintrin.h> // SSSE3 + +#include "./vp9_rtcd.h" #include "vp9/common/x86/vp9_idct_intrin_sse2.h" void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride, diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_intrin_avx2.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_intrin_avx2.c index c67490fad34..dfebaab0ac6 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_intrin_avx2.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_intrin_avx2.c @@ -9,8 +9,9 @@ */ #include <immintrin.h> // AVX2 -#include "vpx/vpx_integer.h" +#include "./vp9_rtcd.h" +#include "vpx/vpx_integer.h" int64_t vp9_block_error_avx2(const int16_t *coeff, const int16_t *dqcoeff, diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_highbd_quantize_intrin_sse2.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_highbd_quantize_intrin_sse2.c index ffa43b65a59..cbdd1c93e1c 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_highbd_quantize_intrin_sse2.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_highbd_quantize_intrin_sse2.c @@ -10,6 +10,7 @@ #include <emmintrin.h> +#include "vpx_ports/mem.h" #include "vp9/common/vp9_common.h" #if CONFIG_VP9_HIGHBITDEPTH diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_highbd_subpel_variance.asm b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_highbd_subpel_variance.asm index 987729f962c..4594bb1aabd 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_highbd_subpel_variance.asm +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_highbd_subpel_variance.asm @@ -14,35 +14,19 @@ SECTION_RODATA pw_8: times 8 dw 8 bilin_filter_m_sse2: times 8 dw 16 times 8 dw 0 - times 8 dw 15 - times 8 dw 1 times 8 dw 14 times 8 dw 2 - times 8 dw 13 - times 8 dw 3 times 8 dw 12 times 8 dw 4 - times 8 dw 11 - times 8 dw 5 times 8 dw 10 times 8 dw 6 - times 8 dw 9 - times 8 dw 7 times 16 dw 8 - times 8 dw 7 - times 8 dw 9 times 8 dw 6 times 8 dw 10 - times 8 dw 5 - times 8 dw 11 times 8 dw 4 times 8 dw 12 - times 8 dw 3 - times 8 dw 13 times 8 dw 2 times 8 dw 14 - times 8 dw 1 - times 8 dw 15 SECTION .text diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_highbd_variance_sse2.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_highbd_variance_sse2.c index 4bc3e7e2d15..29b7b278217 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_highbd_variance_sse2.c +++ 
b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_highbd_variance_sse2.c @@ -13,237 +13,6 @@ #include "vp9/encoder/vp9_variance.h" #include "vpx_ports/mem.h" -typedef uint32_t (*high_variance_fn_t) (const uint16_t *src, int src_stride, - const uint16_t *ref, int ref_stride, - uint32_t *sse, int *sum); - -uint32_t vp9_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride, - const uint16_t *ref, int ref_stride, - uint32_t *sse, int *sum); - -uint32_t vp9_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride, - const uint16_t *ref, int ref_stride, - uint32_t *sse, int *sum); - -static void highbd_variance_sse2(const uint16_t *src, int src_stride, - const uint16_t *ref, int ref_stride, - int w, int h, uint32_t *sse, int *sum, - high_variance_fn_t var_fn, int block_size) { - int i, j; - - *sse = 0; - *sum = 0; - - for (i = 0; i < h; i += block_size) { - for (j = 0; j < w; j += block_size) { - unsigned int sse0; - int sum0; - var_fn(src + src_stride * i + j, src_stride, - ref + ref_stride * i + j, ref_stride, &sse0, &sum0); - *sse += sse0; - *sum += sum0; - } - } -} - -static void highbd_10_variance_sse2(const uint16_t *src, int src_stride, - const uint16_t *ref, int ref_stride, - int w, int h, uint32_t *sse, int *sum, - high_variance_fn_t var_fn, int block_size) { - int i, j; - uint64_t sse_long = 0; - int64_t sum_long = 0; - - for (i = 0; i < h; i += block_size) { - for (j = 0; j < w; j += block_size) { - unsigned int sse0; - int sum0; - var_fn(src + src_stride * i + j, src_stride, - ref + ref_stride * i + j, ref_stride, &sse0, &sum0); - sse_long += sse0; - sum_long += sum0; - } - } - *sum = ROUND_POWER_OF_TWO(sum_long, 2); - *sse = ROUND_POWER_OF_TWO(sse_long, 4); -} - -static void highbd_12_variance_sse2(const uint16_t *src, int src_stride, - const uint16_t *ref, int ref_stride, - int w, int h, uint32_t *sse, int *sum, - high_variance_fn_t var_fn, int block_size) { - int i, j; - uint64_t sse_long = 0; - int64_t sum_long = 0; - - for (i = 0; i < h; i += block_size) { - for (j = 0; j < w; j += block_size) { - unsigned int sse0; - int sum0; - var_fn(src + src_stride * i + j, src_stride, - ref + ref_stride * i + j, ref_stride, &sse0, &sum0); - sse_long += sse0; - sum_long += sum0; - } - } - *sum = ROUND_POWER_OF_TWO(sum_long, 4); - *sse = ROUND_POWER_OF_TWO(sse_long, 8); -} - - -#define HIGH_GET_VAR(S) \ -void vp9_highbd_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \ - const uint8_t *ref8, int ref_stride, \ - uint32_t *sse, int *sum) { \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ - vp9_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \ - sse, sum); \ -} \ -\ -void vp9_highbd_10_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \ - const uint8_t *ref8, int ref_stride, \ - uint32_t *sse, int *sum) { \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ - vp9_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \ - sse, sum); \ - *sum = ROUND_POWER_OF_TWO(*sum, 2); \ - *sse = ROUND_POWER_OF_TWO(*sse, 4); \ -} \ -\ -void vp9_highbd_12_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \ - const uint8_t *ref8, int ref_stride, \ - uint32_t *sse, int *sum) { \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ - vp9_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \ - sse, sum); \ - *sum = ROUND_POWER_OF_TWO(*sum, 4); \ - *sse = ROUND_POWER_OF_TWO(*sse, 8); \ -} - 
-HIGH_GET_VAR(16); -HIGH_GET_VAR(8); - -#undef HIGH_GET_VAR - -#define VAR_FN(w, h, block_size, shift) \ -uint32_t vp9_highbd_variance##w##x##h##_sse2( \ - const uint8_t *src8, int src_stride, \ - const uint8_t *ref8, int ref_stride, uint32_t *sse) { \ - int sum; \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ - highbd_variance_sse2(src, src_stride, ref, ref_stride, w, h, sse, &sum, \ - vp9_highbd_calc##block_size##x##block_size##var_sse2, \ - block_size); \ - return *sse - (((int64_t)sum * sum) >> shift); \ -} \ -\ -uint32_t vp9_highbd_10_variance##w##x##h##_sse2( \ - const uint8_t *src8, int src_stride, \ - const uint8_t *ref8, int ref_stride, uint32_t *sse) { \ - int sum; \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ - highbd_10_variance_sse2( \ - src, src_stride, ref, ref_stride, w, h, sse, &sum, \ - vp9_highbd_calc##block_size##x##block_size##var_sse2, block_size); \ - return *sse - (((int64_t)sum * sum) >> shift); \ -} \ -\ -uint32_t vp9_highbd_12_variance##w##x##h##_sse2( \ - const uint8_t *src8, int src_stride, \ - const uint8_t *ref8, int ref_stride, uint32_t *sse) { \ - int sum; \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ - highbd_12_variance_sse2( \ - src, src_stride, ref, ref_stride, w, h, sse, &sum, \ - vp9_highbd_calc##block_size##x##block_size##var_sse2, block_size); \ - return *sse - (((int64_t)sum * sum) >> shift); \ -} - -VAR_FN(64, 64, 16, 12); -VAR_FN(64, 32, 16, 11); -VAR_FN(32, 64, 16, 11); -VAR_FN(32, 32, 16, 10); -VAR_FN(32, 16, 16, 9); -VAR_FN(16, 32, 16, 9); -VAR_FN(16, 16, 16, 8); -VAR_FN(16, 8, 8, 7); -VAR_FN(8, 16, 8, 7); -VAR_FN(8, 8, 8, 6); - -#undef VAR_FN - -unsigned int vp9_highbd_mse16x16_sse2(const uint8_t *src8, int src_stride, - const uint8_t *ref8, int ref_stride, - unsigned int *sse) { - int sum; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); - highbd_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, - sse, &sum, vp9_highbd_calc16x16var_sse2, 16); - return *sse; -} - -unsigned int vp9_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride, - const uint8_t *ref8, int ref_stride, - unsigned int *sse) { - int sum; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); - highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, - sse, &sum, vp9_highbd_calc16x16var_sse2, 16); - return *sse; -} - -unsigned int vp9_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride, - const uint8_t *ref8, int ref_stride, - unsigned int *sse) { - int sum; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); - highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, - sse, &sum, vp9_highbd_calc16x16var_sse2, 16); - return *sse; -} - -unsigned int vp9_highbd_mse8x8_sse2(const uint8_t *src8, int src_stride, - const uint8_t *ref8, int ref_stride, - unsigned int *sse) { - int sum; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); - highbd_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, - sse, &sum, vp9_highbd_calc8x8var_sse2, 8); - return *sse; -} - -unsigned int vp9_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride, - const uint8_t *ref8, int ref_stride, - unsigned int *sse) { - int sum; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); - highbd_10_variance_sse2(src, src_stride, ref, 
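The VAR_FN() wrappers removed above pass a precomputed shift instead of dividing: every block area here is a power of two, so sum*sum/(w*h) can be taken as an arithmetic shift by log2(w*h). A short illustration (the helper name is made up):

    #include <stdint.h>

    /* shift = log2(w * h): 6 for 8x8, 8 for 16x16, 12 for 64x64, etc. */
    static unsigned int var_from_sums(unsigned int sse, int sum, int shift) {
      return sse - (unsigned int)(((int64_t)sum * sum) >> shift);
    }

    /* e.g. a 32x16 block uses shift 9, matching VAR_FN(32, 16, 16, 9). */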
ref_stride, 8, 8, - sse, &sum, vp9_highbd_calc8x8var_sse2, 8); - return *sse; -} - -unsigned int vp9_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride, - const uint8_t *ref8, int ref_stride, - unsigned int *sse) { - int sum; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); - highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, - sse, &sum, vp9_highbd_calc8x8var_sse2, 8); - return *sse; -} - #define DECL(w, opt) \ int vp9_highbd_sub_pixel_variance##w##xh_##opt(const uint16_t *src, \ ptrdiff_t src_stride, \ diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c index 00abd3c4962..71fdfd71624 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c @@ -11,6 +11,7 @@ #include <emmintrin.h> #include <xmmintrin.h> +#include "./vp9_rtcd.h" #include "vpx/vpx_integer.h" void vp9_quantize_b_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs, diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm index 06b8b034a5e..292cf34d1a2 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm @@ -14,52 +14,28 @@ SECTION_RODATA pw_8: times 8 dw 8 bilin_filter_m_sse2: times 8 dw 16 times 8 dw 0 - times 8 dw 15 - times 8 dw 1 times 8 dw 14 times 8 dw 2 - times 8 dw 13 - times 8 dw 3 times 8 dw 12 times 8 dw 4 - times 8 dw 11 - times 8 dw 5 times 8 dw 10 times 8 dw 6 - times 8 dw 9 - times 8 dw 7 times 16 dw 8 - times 8 dw 7 - times 8 dw 9 times 8 dw 6 times 8 dw 10 - times 8 dw 5 - times 8 dw 11 times 8 dw 4 times 8 dw 12 - times 8 dw 3 - times 8 dw 13 times 8 dw 2 times 8 dw 14 - times 8 dw 1 - times 8 dw 15 bilin_filter_m_ssse3: times 8 db 16, 0 - times 8 db 15, 1 times 8 db 14, 2 - times 8 db 13, 3 times 8 db 12, 4 - times 8 db 11, 5 times 8 db 10, 6 - times 8 db 9, 7 times 16 db 8 - times 8 db 7, 9 times 8 db 6, 10 - times 8 db 5, 11 times 8 db 4, 12 - times 8 db 3, 13 times 8 db 2, 14 - times 8 db 1, 15 SECTION .text diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c index a441cadaf70..b1c79752076 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c @@ -9,42 +9,28 @@ */ #include <immintrin.h> // AVX2 + +#include "./vp9_rtcd.h" #include "vpx_ports/mem.h" #include "vp9/encoder/vp9_variance.h" DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = { 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, - 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, - 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, - 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, - 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 
12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, - 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, - 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, - 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, - 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, - 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, - 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, - 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, - 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, - 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, - 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, - 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15 }; #define FILTER_SRC(filter) \ diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_variance_avx2.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_variance_avx2.c index ea09b959e12..8cd071de5e2 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_variance_avx2.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_variance_avx2.c @@ -7,23 +7,12 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. 
*/ +#include "./vp9_rtcd.h" #include "./vpx_config.h" #include "vp9/encoder/vp9_variance.h" #include "vpx_ports/mem.h" -typedef void (*get_var_avx2)(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse, int *sum); - -void vp9_get16x16var_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse, int *sum); - -void vp9_get32x32var_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse, int *sum); - unsigned int vp9_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride, int x_offset, int y_offset, const uint8_t *dst, int dst_stride, @@ -41,81 +30,6 @@ unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src, int height, unsigned int *sseptr); -static void variance_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - int w, int h, unsigned int *sse, int *sum, - get_var_avx2 var_fn, int block_size) { - int i, j; - - *sse = 0; - *sum = 0; - - for (i = 0; i < h; i += 16) { - for (j = 0; j < w; j += block_size) { - unsigned int sse0; - int sum0; - var_fn(&src[src_stride * i + j], src_stride, - &ref[ref_stride * i + j], ref_stride, &sse0, &sum0); - *sse += sse0; - *sum += sum0; - } - } -} - - -unsigned int vp9_variance16x16_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_avx2(src, src_stride, ref, ref_stride, 16, 16, - sse, &sum, vp9_get16x16var_avx2, 16); - return *sse - (((unsigned int)sum * sum) >> 8); -} - -unsigned int vp9_mse16x16_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - vp9_get16x16var_avx2(src, src_stride, ref, ref_stride, sse, &sum); - return *sse; -} - -unsigned int vp9_variance32x16_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_avx2(src, src_stride, ref, ref_stride, 32, 16, - sse, &sum, vp9_get32x32var_avx2, 32); - return *sse - (((int64_t)sum * sum) >> 9); -} - -unsigned int vp9_variance32x32_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_avx2(src, src_stride, ref, ref_stride, 32, 32, - sse, &sum, vp9_get32x32var_avx2, 32); - return *sse - (((int64_t)sum * sum) >> 10); -} - -unsigned int vp9_variance64x64_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_avx2(src, src_stride, ref, ref_stride, 64, 64, - sse, &sum, vp9_get32x32var_avx2, 32); - return *sse - (((int64_t)sum * sum) >> 12); -} - -unsigned int vp9_variance64x32_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_avx2(src, src_stride, ref, ref_stride, 64, 32, - sse, &sum, vp9_get32x32var_avx2, 32); - return *sse - (((int64_t)sum * sum) >> 11); -} - unsigned int vp9_sub_pixel_variance64x64_avx2(const uint8_t *src, int src_stride, int x_offset, diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_variance_sse2.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_variance_sse2.c index 8490bbbdc2e..961efe34ee6 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_variance_sse2.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_variance_sse2.c @@ -10,310 +10,12 @@ #include <emmintrin.h> // SSE2 +#include "./vp9_rtcd.h" #include "./vpx_config.h" 
#include "vp9/encoder/vp9_variance.h" #include "vpx_ports/mem.h" -typedef unsigned int (*variance_fn_t) (const unsigned char *src, int src_stride, - const unsigned char *ref, int ref_stride, - unsigned int *sse, int *sum); - -unsigned int vp9_get_mb_ss_sse2(const int16_t *src) { - __m128i vsum = _mm_setzero_si128(); - int i; - - for (i = 0; i < 32; ++i) { - const __m128i v = _mm_loadu_si128((const __m128i *)src); - vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v)); - src += 8; - } - - vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8)); - vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4)); - return _mm_cvtsi128_si32(vsum); -} - -#define READ64(p, stride, i) \ - _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const uint32_t *)(p + i * stride)), \ - _mm_cvtsi32_si128(*(const uint32_t *)(p + (i + 1) * stride))) - -unsigned int vp9_get4x4var_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse, int *sum) { - const __m128i zero = _mm_setzero_si128(); - const __m128i src0 = _mm_unpacklo_epi8(READ64(src, src_stride, 0), zero); - const __m128i src1 = _mm_unpacklo_epi8(READ64(src, src_stride, 2), zero); - const __m128i ref0 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 0), zero); - const __m128i ref1 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 2), zero); - const __m128i diff0 = _mm_sub_epi16(src0, ref0); - const __m128i diff1 = _mm_sub_epi16(src1, ref1); - - // sum - __m128i vsum = _mm_add_epi16(diff0, diff1); - vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); - vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); - vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2)); - *sum = (int16_t)_mm_extract_epi16(vsum, 0); - - // sse - vsum = _mm_add_epi32(_mm_madd_epi16(diff0, diff0), - _mm_madd_epi16(diff1, diff1)); - vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8)); - vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4)); - *sse = _mm_cvtsi128_si32(vsum); - - return 0; -} - -unsigned int vp9_get8x8var_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse, int *sum) { - const __m128i zero = _mm_setzero_si128(); - __m128i vsum = _mm_setzero_si128(); - __m128i vsse = _mm_setzero_si128(); - int i; - - for (i = 0; i < 8; i += 2) { - const __m128i src0 = _mm_unpacklo_epi8(_mm_loadl_epi64( - (const __m128i *)(src + i * src_stride)), zero); - const __m128i ref0 = _mm_unpacklo_epi8(_mm_loadl_epi64( - (const __m128i *)(ref + i * ref_stride)), zero); - const __m128i diff0 = _mm_sub_epi16(src0, ref0); - - const __m128i src1 = _mm_unpacklo_epi8(_mm_loadl_epi64( - (const __m128i *)(src + (i + 1) * src_stride)), zero); - const __m128i ref1 = _mm_unpacklo_epi8(_mm_loadl_epi64( - (const __m128i *)(ref + (i + 1) * ref_stride)), zero); - const __m128i diff1 = _mm_sub_epi16(src1, ref1); - - vsum = _mm_add_epi16(vsum, diff0); - vsum = _mm_add_epi16(vsum, diff1); - vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0)); - vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1)); - } - - // sum - vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); - vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); - vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2)); - *sum = (int16_t)_mm_extract_epi16(vsum, 0); - - // sse - vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8)); - vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4)); - *sse = _mm_cvtsi128_si32(vsse); - - return 0; -} - -unsigned int vp9_get16x16var_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse, int *sum) { - const __m128i zero = 
_mm_setzero_si128(); - __m128i vsum = _mm_setzero_si128(); - __m128i vsse = _mm_setzero_si128(); - int i; - - for (i = 0; i < 16; ++i) { - const __m128i s = _mm_loadu_si128((const __m128i *)src); - const __m128i r = _mm_loadu_si128((const __m128i *)ref); - - const __m128i src0 = _mm_unpacklo_epi8(s, zero); - const __m128i ref0 = _mm_unpacklo_epi8(r, zero); - const __m128i diff0 = _mm_sub_epi16(src0, ref0); - - const __m128i src1 = _mm_unpackhi_epi8(s, zero); - const __m128i ref1 = _mm_unpackhi_epi8(r, zero); - const __m128i diff1 = _mm_sub_epi16(src1, ref1); - - vsum = _mm_add_epi16(vsum, diff0); - vsum = _mm_add_epi16(vsum, diff1); - vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0)); - vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1)); - - src += src_stride; - ref += ref_stride; - } - - // sum - vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); - vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); - *sum = (int16_t)_mm_extract_epi16(vsum, 0) + - (int16_t)_mm_extract_epi16(vsum, 1); - - // sse - vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8)); - vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4)); - *sse = _mm_cvtsi128_si32(vsse); - - return 0; -} - - -static void variance_sse2(const unsigned char *src, int src_stride, - const unsigned char *ref, int ref_stride, - int w, int h, unsigned int *sse, int *sum, - variance_fn_t var_fn, int block_size) { - int i, j; - - *sse = 0; - *sum = 0; - - for (i = 0; i < h; i += block_size) { - for (j = 0; j < w; j += block_size) { - unsigned int sse0; - int sum0; - var_fn(src + src_stride * i + j, src_stride, - ref + ref_stride * i + j, ref_stride, &sse0, &sum0); - *sse += sse0; - *sum += sum0; - } - } -} - -unsigned int vp9_variance4x4_sse2(const unsigned char *src, int src_stride, - const unsigned char *ref, int ref_stride, - unsigned int *sse) { - int sum; - vp9_get4x4var_sse2(src, src_stride, ref, ref_stride, sse, &sum); - return *sse - (((unsigned int)sum * sum) >> 4); -} - -unsigned int vp9_variance8x4_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_sse2(src, src_stride, ref, ref_stride, 8, 4, - sse, &sum, vp9_get4x4var_sse2, 4); - return *sse - (((unsigned int)sum * sum) >> 5); -} - -unsigned int vp9_variance4x8_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_sse2(src, src_stride, ref, ref_stride, 4, 8, - sse, &sum, vp9_get4x4var_sse2, 4); - return *sse - (((unsigned int)sum * sum) >> 5); -} - -unsigned int vp9_variance8x8_sse2(const unsigned char *src, int src_stride, - const unsigned char *ref, int ref_stride, - unsigned int *sse) { - int sum; - vp9_get8x8var_sse2(src, src_stride, ref, ref_stride, sse, &sum); - return *sse - (((unsigned int)sum * sum) >> 6); -} - -unsigned int vp9_variance16x8_sse2(const unsigned char *src, int src_stride, - const unsigned char *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_sse2(src, src_stride, ref, ref_stride, 16, 8, - sse, &sum, vp9_get8x8var_sse2, 8); - return *sse - (((unsigned int)sum * sum) >> 7); -} - -unsigned int vp9_variance8x16_sse2(const unsigned char *src, int src_stride, - const unsigned char *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_sse2(src, src_stride, ref, ref_stride, 8, 16, - sse, &sum, vp9_get8x8var_sse2, 8); - return *sse - (((unsigned int)sum * sum) >> 7); -} - -unsigned int vp9_variance16x16_sse2(const unsigned char *src, int src_stride, - const unsigned char *ref, int 
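The deleted get8x8var/get16x16var kernels in this file all end with the same horizontal-reduction idiom: the packed 16-bit sums are folded onto themselves with byte shifts until lane 0 holds the total. Isolated as a helper, the pattern looks like this (SSE2 only; the function name is illustrative):

    #include <emmintrin.h>
    #include <stdint.h>

    /* Sum the eight int16_t lanes of v; valid only when the true total fits
     * in 16 bits, as it does for an 8x8 block of pixel differences. */
    static int16_t hsum_epi16(__m128i v) {
      v = _mm_add_epi16(v, _mm_srli_si128(v, 8));  /* fold in the upper 64 bits */
      v = _mm_add_epi16(v, _mm_srli_si128(v, 4));  /* ...the upper 32 bits      */
      v = _mm_add_epi16(v, _mm_srli_si128(v, 2));  /* ...the upper 16 bits      */
      return (int16_t)_mm_extract_epi16(v, 0);
    }

The 16x16 kernel stops one fold earlier and adds two extracted lanes in scalar code, since its total can exceed the 16-bit range.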
ref_stride, - unsigned int *sse) { - int sum; - vp9_get16x16var_sse2(src, src_stride, ref, ref_stride, sse, &sum); - return *sse - (((unsigned int)sum * sum) >> 8); -} - -unsigned int vp9_variance32x32_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_sse2(src, src_stride, ref, ref_stride, 32, 32, - sse, &sum, vp9_get16x16var_sse2, 16); - return *sse - (((int64_t)sum * sum) >> 10); -} - -unsigned int vp9_variance32x16_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_sse2(src, src_stride, ref, ref_stride, 32, 16, - sse, &sum, vp9_get16x16var_sse2, 16); - return *sse - (((int64_t)sum * sum) >> 9); -} - -unsigned int vp9_variance16x32_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_sse2(src, src_stride, ref, ref_stride, 16, 32, - sse, &sum, vp9_get16x16var_sse2, 16); - return *sse - (((int64_t)sum * sum) >> 9); -} - -unsigned int vp9_variance64x64_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_sse2(src, src_stride, ref, ref_stride, 64, 64, - sse, &sum, vp9_get16x16var_sse2, 16); - return *sse - (((int64_t)sum * sum) >> 12); -} - -unsigned int vp9_variance64x32_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_sse2(src, src_stride, ref, ref_stride, 64, 32, - sse, &sum, vp9_get16x16var_sse2, 16); - return *sse - (((int64_t)sum * sum) >> 11); -} - -unsigned int vp9_variance32x64_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_sse2(src, src_stride, ref, ref_stride, 32, 64, - sse, &sum, vp9_get16x16var_sse2, 16); - return *sse - (((int64_t)sum * sum) >> 11); -} - -unsigned int vp9_mse8x8_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - vp9_variance8x8_sse2(src, src_stride, ref, ref_stride, sse); - return *sse; -} - -unsigned int vp9_mse8x16_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - vp9_variance8x16_sse2(src, src_stride, ref, ref_stride, sse); - return *sse; -} - -unsigned int vp9_mse16x8_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - vp9_variance16x8_sse2(src, src_stride, ref, ref_stride, sse); - return *sse; -} - -unsigned int vp9_mse16x16_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - vp9_variance16x16_sse2(src, src_stride, ref, ref_stride, sse); - return *sse; -} - // The 2 unused parameters are place holders for PIC enabled build. 
#define DECL(w, opt) \ int vp9_sub_pixel_variance##w##xh_##opt(const uint8_t *src, \ diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/vp9_common.mk b/chromium/third_party/libvpx/source/libvpx/vp9/vp9_common.mk index c9326eeea69..6f091eefb63 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/vp9_common.mk +++ b/chromium/third_party/libvpx/source/libvpx/vp9/vp9_common.mk @@ -69,6 +69,7 @@ VP9_COMMON_SRCS-yes += common/vp9_common_data.h VP9_COMMON_SRCS-yes += common/vp9_scan.c VP9_COMMON_SRCS-yes += common/vp9_scan.h +VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/convolve.h VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_asm_stubs.c VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_loopfilter_intrin_sse2.c VP9_COMMON_SRCS-$(HAVE_AVX2) += common/x86/vp9_loopfilter_intrin_avx2.c @@ -131,14 +132,29 @@ VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_mblpf_vert_loopfilter_ds # common (msa) VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_macros_msa.h +VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_avg_horiz_msa.c +VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_avg_msa.c +VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_avg_vert_msa.c VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_horiz_msa.c VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_msa.c VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_vert_msa.c VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve_avg_msa.c VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve_copy_msa.c VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve_msa.h +VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct4x4_msa.c +VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct8x8_msa.c VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct16x16_msa.c VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct32x32_msa.c +VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct_msa.h +VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_intra_predict_msa.c +VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_loopfilter_4_msa.c +VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_loopfilter_8_msa.c +VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_loopfilter_16_msa.c +VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_loopfilter_msa.h + +ifeq ($(CONFIG_VP9_POSTPROC),yes) +VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_mfqe_msa.c +endif VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.h @@ -197,8 +213,9 @@ VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_4_neon.c # TODO(johannkoenig): re-enable when chromium build is fixed # # https://code.google.com/p/chromium/issues/detail?id=443839 #VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_8_neon.c -VP9_COMMON_SRCS-yes += common/arm/neon/vp9_reconintra_neon.c endif # HAVE_NEON endif # HAVE_NEON_ASM +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_reconintra_neon.c + $(eval $(call rtcd_h_template,vp9_rtcd,vp9/common/vp9_rtcd_defs.pl)) diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/vp9_cx_iface.c b/chromium/third_party/libvpx/source/libvpx/vp9/vp9_cx_iface.c index cba15e693e9..9462be9faf1 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/vp9_cx_iface.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/vp9_cx_iface.c @@ -176,15 +176,23 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, RANGE_CHECK(cfg, ss_number_layers, 1, VPX_SS_MAX_LAYERS); 
RANGE_CHECK(cfg, ts_number_layers, 1, VPX_TS_MAX_LAYERS); + if (cfg->ss_number_layers * cfg->ts_number_layers > VPX_MAX_LAYERS) + ERROR("ss_number_layers * ts_number_layers is out of range"); if (cfg->ts_number_layers > 1) { - unsigned int i; - for (i = 1; i < cfg->ts_number_layers; ++i) - if (cfg->ts_target_bitrate[i] < cfg->ts_target_bitrate[i - 1]) + unsigned int sl, tl; + for (sl = 1; sl < cfg->ss_number_layers; ++sl) { + for (tl = 1; tl < cfg->ts_number_layers; ++tl) { + const int layer = + LAYER_IDS_TO_IDX(sl, tl, cfg->ts_number_layers); + if (cfg->layer_target_bitrate[layer] < + cfg->layer_target_bitrate[layer - 1]) ERROR("ts_target_bitrate entries are not increasing"); + } + } RANGE_CHECK(cfg, ts_rate_decimator[cfg->ts_number_layers - 1], 1, 1); - for (i = cfg->ts_number_layers - 2; i > 0; --i) - if (cfg->ts_rate_decimator[i - 1] != 2 * cfg->ts_rate_decimator[i]) + for (tl = cfg->ts_number_layers - 2; tl > 0; --tl) + if (cfg->ts_rate_decimator[tl - 1] != 2 * cfg->ts_rate_decimator[tl]) ERROR("ts_rate_decimator factors are not powers of 2"); } @@ -360,6 +368,7 @@ static vpx_codec_err_t set_encoder_config( const vpx_codec_enc_cfg_t *cfg, const struct vp9_extracfg *extra_cfg) { const int is_vbr = cfg->rc_end_usage == VPX_VBR; + int sl, tl; oxcf->profile = cfg->g_profile; oxcf->max_threads = (int)cfg->g_threads; oxcf->width = cfg->g_w; @@ -460,35 +469,33 @@ static vpx_codec_err_t set_encoder_config( oxcf->frame_periodic_boost = extra_cfg->frame_periodic_boost; oxcf->ss_number_layers = cfg->ss_number_layers; + oxcf->ts_number_layers = cfg->ts_number_layers; + oxcf->temporal_layering_mode = (enum vp9e_temporal_layering_mode) + cfg->temporal_layering_mode; - if (oxcf->ss_number_layers > 1) { - int i; - for (i = 0; i < VPX_SS_MAX_LAYERS; ++i) { - oxcf->ss_target_bitrate[i] = 1000 * cfg->ss_target_bitrate[i]; + for (sl = 0; sl < oxcf->ss_number_layers; ++sl) { #if CONFIG_SPATIAL_SVC - oxcf->ss_enable_auto_arf[i] = cfg->ss_enable_auto_alt_ref[i]; + oxcf->ss_enable_auto_arf[sl] = cfg->ss_enable_auto_alt_ref[sl]; #endif + for (tl = 0; tl < oxcf->ts_number_layers; ++tl) { + oxcf->layer_target_bitrate[sl * oxcf->ts_number_layers + tl] = + 1000 * cfg->layer_target_bitrate[sl * oxcf->ts_number_layers + tl]; } - } else if (oxcf->ss_number_layers == 1) { + } + if (oxcf->ss_number_layers == 1 && oxcf->pass != 0) { oxcf->ss_target_bitrate[0] = (int)oxcf->target_bandwidth; #if CONFIG_SPATIAL_SVC oxcf->ss_enable_auto_arf[0] = extra_cfg->enable_auto_alt_ref; #endif } - - oxcf->ts_number_layers = cfg->ts_number_layers; - if (oxcf->ts_number_layers > 1) { - int i; - for (i = 0; i < VPX_TS_MAX_LAYERS; ++i) { - oxcf->ts_target_bitrate[i] = 1000 * cfg->ts_target_bitrate[i]; - oxcf->ts_rate_decimator[i] = cfg->ts_rate_decimator[i]; + for (tl = 0; tl < VPX_TS_MAX_LAYERS; ++tl) { + oxcf->ts_rate_decimator[tl] = cfg->ts_rate_decimator[tl] ? 
+ cfg->ts_rate_decimator[tl] : 1; } } else if (oxcf->ts_number_layers == 1) { - oxcf->ts_target_bitrate[0] = (int)oxcf->target_bandwidth; oxcf->ts_rate_decimator[0] = 1; } - /* printf("Current VP9 Settings: \n"); printf("target_bandwidth: %d\n", oxcf->target_bandwidth); @@ -902,11 +909,12 @@ static vpx_codec_frame_flags_t get_frame_pkt_flags(const VP9_COMP *cpi, unsigned int lib_flags) { vpx_codec_frame_flags_t flags = lib_flags << 16; - if (lib_flags & FRAMEFLAGS_KEY -#if CONFIG_SPATIAL_SVC - || (is_two_pass_svc(cpi) && cpi->svc.layer_context[0].is_key_frame) -#endif - ) + if (lib_flags & FRAMEFLAGS_KEY || + (cpi->use_svc && + cpi->svc.layer_context[cpi->svc.spatial_layer_id * + cpi->svc.number_temporal_layers + + cpi->svc.temporal_layer_id].is_key_frame) + ) flags |= VPX_FRAME_IS_KEY; if (cpi->droppable) @@ -1022,16 +1030,15 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, vpx_codec_cx_pkt_t pkt; #if CONFIG_SPATIAL_SVC - if (is_two_pass_svc(cpi)) - cpi->svc.layer_context[cpi->svc.spatial_layer_id].layer_size += size; + if (cpi->use_svc) + cpi->svc.layer_context[cpi->svc.spatial_layer_id * + cpi->svc.number_temporal_layers].layer_size += size; #endif // Pack invisible frames with the next visible frame - if (!cpi->common.show_frame -#if CONFIG_SPATIAL_SVC - || (is_two_pass_svc(cpi) && - cpi->svc.spatial_layer_id < cpi->svc.number_spatial_layers - 1) -#endif + if (!cpi->common.show_frame || + (cpi->use_svc && + cpi->svc.spatial_layer_id < cpi->svc.number_spatial_layers - 1) ) { if (ctx->pending_cx_data == 0) ctx->pending_cx_data = cx_data; @@ -1089,24 +1096,27 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, pkt.data.frame.partition_id = -1; if(ctx->output_cx_pkt_cb.output_cx_pkt) - ctx->output_cx_pkt_cb.output_cx_pkt(&pkt, ctx->output_cx_pkt_cb.user_priv); + ctx->output_cx_pkt_cb.output_cx_pkt(&pkt, + ctx->output_cx_pkt_cb.user_priv); else vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt); cx_data += size; cx_data_sz -= size; +#if VPX_ENCODER_ABI_VERSION > (5 + VPX_CODEC_ABI_VERSION) #if CONFIG_SPATIAL_SVC - if (is_two_pass_svc(cpi) && !ctx->output_cx_pkt_cb.output_cx_pkt) { + if (cpi->use_svc && !ctx->output_cx_pkt_cb.output_cx_pkt) { vpx_codec_cx_pkt_t pkt_sizes, pkt_psnr; - int i; + int sl; vp9_zero(pkt_sizes); vp9_zero(pkt_psnr); pkt_sizes.kind = VPX_CODEC_SPATIAL_SVC_LAYER_SIZES; pkt_psnr.kind = VPX_CODEC_SPATIAL_SVC_LAYER_PSNR; - for (i = 0; i < cpi->svc.number_spatial_layers; ++i) { - LAYER_CONTEXT *lc = &cpi->svc.layer_context[i]; - pkt_sizes.data.layer_sizes[i] = lc->layer_size; - pkt_psnr.data.layer_psnr[i] = lc->psnr_pkt; + for (sl = 0; sl < cpi->svc.number_spatial_layers; ++sl) { + LAYER_CONTEXT *lc = + &cpi->svc.layer_context[sl * cpi->svc.number_temporal_layers]; + pkt_sizes.data.layer_sizes[sl] = lc->layer_size; + pkt_psnr.data.layer_psnr[sl] = lc->psnr_pkt; lc->layer_size = 0; } @@ -1115,6 +1125,12 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt_psnr); } #endif +#endif + if (is_one_pass_cbr_svc(cpi) && + (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)) { + // Encoded all spatial layers; exit loop. + break; + } } } } @@ -1292,16 +1308,20 @@ static vpx_codec_err_t ctrl_set_scale_mode(vpx_codec_alg_priv_t *ctx, static vpx_codec_err_t ctrl_set_svc(vpx_codec_alg_priv_t *ctx, va_list args) { int data = va_arg(args, int); const vpx_codec_enc_cfg_t *cfg = &ctx->cfg; + // Both one-pass and two-pass RC are supported now. 
+ // User setting this has to make sure of the following. + // In two-pass setting: either (but not both) + // cfg->ss_number_layers > 1, or cfg->ts_number_layers > 1 + // In one-pass setting: + // either or both cfg->ss_number_layers > 1, or cfg->ts_number_layers > 1 vp9_set_svc(ctx->cpi, data); - // CBR or two pass mode for SVC with both temporal and spatial layers - // not yet supported. + if (data == 1 && - (cfg->rc_end_usage == VPX_CBR || - cfg->g_pass == VPX_RC_FIRST_PASS || + (cfg->g_pass == VPX_RC_FIRST_PASS || cfg->g_pass == VPX_RC_LAST_PASS) && - cfg->ss_number_layers > 1 && - cfg->ts_number_layers > 1) { + cfg->ss_number_layers > 1 && + cfg->ts_number_layers > 1) { return VPX_CODEC_INVALID_PARAM; } return VPX_CODEC_OK; @@ -1313,9 +1333,7 @@ static vpx_codec_err_t ctrl_set_svc_layer_id(vpx_codec_alg_priv_t *ctx, VP9_COMP *const cpi = (VP9_COMP *)ctx->cpi; SVC *const svc = &cpi->svc; -#if VPX_ENCODER_ABI_VERSION > (4 + VPX_CODEC_ABI_VERSION) svc->spatial_layer_id = data->spatial_layer_id; -#endif svc->temporal_layer_id = data->temporal_layer_id; // Checks on valid layer_id input. if (svc->temporal_layer_id < 0 || @@ -1335,9 +1353,7 @@ static vpx_codec_err_t ctrl_get_svc_layer_id(vpx_codec_alg_priv_t *ctx, VP9_COMP *const cpi = (VP9_COMP *)ctx->cpi; SVC *const svc = &cpi->svc; -#if VPX_ENCODER_ABI_VERSION > (4 + VPX_CODEC_ABI_VERSION) data->spatial_layer_id = svc->spatial_layer_id; -#endif data->temporal_layer_id = svc->temporal_layer_id; return VPX_CODEC_OK; @@ -1347,15 +1363,21 @@ static vpx_codec_err_t ctrl_set_svc_parameters(vpx_codec_alg_priv_t *ctx, va_list args) { VP9_COMP *const cpi = ctx->cpi; vpx_svc_extra_cfg_t *const params = va_arg(args, vpx_svc_extra_cfg_t *); - int i; - - for (i = 0; i < cpi->svc.number_spatial_layers; ++i) { - LAYER_CONTEXT *lc = &cpi->svc.layer_context[i]; - - lc->max_q = params->max_quantizers[i]; - lc->min_q = params->min_quantizers[i]; - lc->scaling_factor_num = params->scaling_factor_num[i]; - lc->scaling_factor_den = params->scaling_factor_den[i]; + int sl, tl; + + // Number of temporal layers and number of spatial layers have to be set + // properly before calling this control function. 
+ for (sl = 0; sl < cpi->svc.number_spatial_layers; ++sl) { + for (tl = 0; tl < cpi->svc.number_temporal_layers; ++tl) { + const int layer = + LAYER_IDS_TO_IDX(sl, tl, cpi->svc.number_temporal_layers); + LAYER_CONTEXT *lc = + &cpi->svc.layer_context[layer]; + lc->max_q = params->max_quantizers[sl]; + lc->min_q = params->min_quantizers[sl]; + lc->scaling_factor_num = params->scaling_factor_num[sl]; + lc->scaling_factor_den = params->scaling_factor_den[sl]; + } } return VPX_CODEC_OK; @@ -1416,10 +1438,8 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { {VP9E_SET_AQ_MODE, ctrl_set_aq_mode}, {VP9E_SET_FRAME_PERIODIC_BOOST, ctrl_set_frame_periodic_boost}, {VP9E_SET_SVC, ctrl_set_svc}, -#if VPX_ENCODER_ABI_VERSION > (4 + VPX_CODEC_ABI_VERSION) {VP9E_SET_SVC_PARAMETERS, ctrl_set_svc_parameters}, {VP9E_REGISTER_CX_CALLBACK, ctrl_register_cx_callback}, -#endif {VP9E_SET_SVC_LAYER_ID, ctrl_set_svc_layer_id}, {VP9E_SET_TUNE_CONTENT, ctrl_set_tune_content}, {VP9E_SET_COLOR_SPACE, ctrl_set_color_space}, @@ -1429,9 +1449,7 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { {VP8E_GET_LAST_QUANTIZER, ctrl_get_quantizer}, {VP8E_GET_LAST_QUANTIZER_64, ctrl_get_quantizer64}, {VP9_GET_REFERENCE, ctrl_get_reference}, -#if VPX_ENCODER_ABI_VERSION > (4 + VPX_CODEC_ABI_VERSION) {VP9E_GET_SVC_LAYER_ID, ctrl_get_svc_layer_id}, -#endif {VP9E_GET_ACTIVEMAP, ctrl_get_active_map}, { -1, NULL}, @@ -1495,6 +1513,8 @@ static vpx_codec_enc_cfg_map_t encoder_usage_cfg_map[] = { {0}, // ts_rate_decimator 0, // ts_periodicity {0}, // ts_layer_id + {0}, // layer_taget_bitrate + 0 // temporal_layering_mode } }, }; diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/vp9_dx_iface.c b/chromium/third_party/libvpx/source/libvpx/vp9/vp9_dx_iface.c index ff76204d822..4080d64c170 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/vp9_dx_iface.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/vp9_dx_iface.c @@ -55,6 +55,7 @@ struct vpx_codec_alg_priv { int invert_tile_order; int last_show_frame; // Index of last output frame. int byte_alignment; + int skip_loop_filter; // Frame parallel related. int frame_parallel_decode; // frame-based threading. 
@@ -285,6 +286,7 @@ static void init_buffer_callbacks(vpx_codec_alg_priv_t *ctx) { cm->new_fb_idx = INVALID_IDX; cm->byte_alignment = ctx->byte_alignment; + cm->skip_loop_filter = ctx->skip_loop_filter; if (ctx->get_ext_fb_cb != NULL && ctx->release_ext_fb_cb != NULL) { pool->get_fb_cb = ctx->get_ext_fb_cb; @@ -937,7 +939,8 @@ static vpx_codec_err_t ctrl_get_frame_corrupted(vpx_codec_alg_priv_t *ctx, frame_worker_data->pbi->common.buffer_pool->frame_bufs; if (frame_worker_data->pbi->common.frame_to_show == NULL) return VPX_CODEC_ERROR; - *corrupted = frame_bufs[ctx->last_show_frame].buf.corrupted; + if (ctx->last_show_frame >= 0) + *corrupted = frame_bufs[ctx->last_show_frame].buf.corrupted; return VPX_CODEC_OK; } else { return VPX_CODEC_ERROR; @@ -1058,6 +1061,19 @@ static vpx_codec_err_t ctrl_set_byte_alignment(vpx_codec_alg_priv_t *ctx, return VPX_CODEC_OK; } +static vpx_codec_err_t ctrl_set_skip_loop_filter(vpx_codec_alg_priv_t *ctx, + va_list args) { + ctx->skip_loop_filter = va_arg(args, int); + + if (ctx->frame_workers) { + VP9Worker *const worker = ctx->frame_workers; + FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; + frame_worker_data->pbi->common.skip_loop_filter = ctx->skip_loop_filter; + } + + return VPX_CODEC_OK; +} + static vpx_codec_ctrl_fn_map_t decoder_ctrl_maps[] = { {VP8_COPY_REFERENCE, ctrl_copy_reference}, @@ -1071,6 +1087,7 @@ static vpx_codec_ctrl_fn_map_t decoder_ctrl_maps[] = { {VP9_INVERT_TILE_DECODE_ORDER, ctrl_set_invert_tile_order}, {VPXD_SET_DECRYPTOR, ctrl_set_decryptor}, {VP9_SET_BYTE_ALIGNMENT, ctrl_set_byte_alignment}, + {VP9_SET_SKIP_LOOP_FILTER, ctrl_set_skip_loop_filter}, // Getters {VP8D_GET_LAST_REF_UPDATES, ctrl_get_last_ref_updates}, diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/vp9_iface_common.h b/chromium/third_party/libvpx/source/libvpx/vp9/vp9_iface_common.h index e585aa14725..58bb7d5d648 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/vp9_iface_common.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/vp9_iface_common.h @@ -10,6 +10,8 @@ #ifndef VP9_VP9_IFACE_COMMON_H_ #define VP9_VP9_IFACE_COMMON_H_ +#include "vpx_ports/mem.h" + static void yuvconfig2image(vpx_image_t *img, const YV12_BUFFER_CONFIG *yv12, void *user_priv) { /** vpx_img_wrap() doesn't allow specifying independent strides for diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/vp9cx.mk b/chromium/third_party/libvpx/source/libvpx/vp9/vp9cx.mk index 7359b2de05d..e78c111f085 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/vp9cx.mk +++ b/chromium/third_party/libvpx/source/libvpx/vp9/vp9cx.mk @@ -102,13 +102,11 @@ VP9_CX_SRCS-yes += encoder/vp9_temporal_filter.h VP9_CX_SRCS-yes += encoder/vp9_mbgraph.c VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h -VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_impl_intrin_avx2.c VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_avg_intrin_sse2.c VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) -VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_variance_impl_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_quantize_intrin_sse2.c VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_block_error_intrin_sse2.c endif @@ -134,14 +132,14 @@ VP9_CX_SRCS-$(ARCH_X86_64) += encoder/x86/vp9_ssim_opt_x86_64.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.c 
VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_dct_ssse3.c VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.h -VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct32x32_sse2.c -VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_impl_sse2.c +VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct32x32_sse2_impl.h +VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2_impl.h ifeq ($(CONFIG_VP9_TEMPORAL_DENOISING),yes) VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_denoiser_sse2.c endif -VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_dct32x32_avx2.c +VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_dct32x32_avx2_impl.h VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_dct_avx2.c VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_intrin_avx2.c VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_avx2.c @@ -154,4 +152,14 @@ VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_quantize_neon.c VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_subtract_neon.c VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_variance_neon.c +VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_avg_msa.c +VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_error_msa.c +VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct4x4_msa.c +VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct8x8_msa.c +VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct16x16_msa.c +VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct32x32_msa.c +VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct_msa.h +VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_subtract_msa.c +VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_temporal_filter_msa.c + VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes)) diff --git a/chromium/third_party/libvpx/source/libvpx/vpx/exports_dec b/chromium/third_party/libvpx/source/libvpx/vpx/exports_dec index 3ce1499b77d..c694ebae128 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx/exports_dec +++ b/chromium/third_party/libvpx/source/libvpx/vpx/exports_dec @@ -1,10 +1,8 @@ text vpx_codec_dec_init_ver text vpx_codec_decode text vpx_codec_get_frame -text vpx_codec_get_mem_map text vpx_codec_get_stream_info text vpx_codec_peek_stream_info text vpx_codec_register_put_frame_cb text vpx_codec_register_put_slice_cb text vpx_codec_set_frame_buffer_functions -text vpx_codec_set_mem_map diff --git a/chromium/third_party/libvpx/source/libvpx/vpx/src/svc_encodeframe.c b/chromium/third_party/libvpx/source/libvpx/vpx/src/svc_encodeframe.c index e711cf909ba..9844ace54dc 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx/src/svc_encodeframe.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx/src/svc_encodeframe.c @@ -302,31 +302,79 @@ void assign_layer_bitrates(const SvcContext *svc_ctx, vpx_codec_enc_cfg_t *const enc_cfg) { int i; const SvcInternal_t *const si = get_const_svc_internal(svc_ctx); + int sl, tl, spatial_layer_target; + + if (svc_ctx->temporal_layering_mode != 0) { + if (si->bitrates[0] != 0) { + enc_cfg->rc_target_bitrate = 0; + for (sl = 0; sl < svc_ctx->spatial_layers; ++sl) { + enc_cfg->ss_target_bitrate[sl*svc_ctx->temporal_layers] = 0; + for (tl = 0; tl < svc_ctx->temporal_layers; ++tl) { + enc_cfg->ss_target_bitrate[sl*svc_ctx->temporal_layers] + += (unsigned int)si->bitrates[sl * svc_ctx->temporal_layers + tl]; + enc_cfg->layer_target_bitrate[sl*svc_ctx->temporal_layers + tl] + = si->bitrates[sl * svc_ctx->temporal_layers + tl]; + } + } + } else { + float total = 0; + float alloc_ratio[VPX_MAX_LAYERS] = {0}; + + for (sl = 0; sl < svc_ctx->spatial_layers; ++sl) { + if (si->svc_params.scaling_factor_den[sl] > 0) { + 
alloc_ratio[sl] = (float)(si->svc_params.scaling_factor_num[sl] * + 1.0 / si->svc_params.scaling_factor_den[sl]); + total += alloc_ratio[sl]; + } + } - if (si->bitrates[0] != 0) { - enc_cfg->rc_target_bitrate = 0; - for (i = 0; i < svc_ctx->spatial_layers; ++i) { - enc_cfg->ss_target_bitrate[i] = (unsigned int)si->bitrates[i]; - enc_cfg->rc_target_bitrate += si->bitrates[i]; + for (sl = 0; sl < svc_ctx->spatial_layers; ++sl) { + enc_cfg->ss_target_bitrate[sl] = spatial_layer_target = + (unsigned int)(enc_cfg->rc_target_bitrate * + alloc_ratio[sl] / total); + if (svc_ctx->temporal_layering_mode == 3) { + enc_cfg->layer_target_bitrate[sl * svc_ctx->temporal_layers] = + spatial_layer_target >> 1; + enc_cfg->layer_target_bitrate[sl * svc_ctx->temporal_layers + 1] = + (spatial_layer_target >> 1) + (spatial_layer_target >> 2); + enc_cfg->layer_target_bitrate[sl * svc_ctx->temporal_layers + 2] = + spatial_layer_target; + } else if (svc_ctx->temporal_layering_mode == 2) { + enc_cfg->layer_target_bitrate[sl * svc_ctx->temporal_layers] = + spatial_layer_target * 2 / 3; + enc_cfg->layer_target_bitrate[sl * svc_ctx->temporal_layers + 1] = + spatial_layer_target; + } else { + // User should explicitly assign bitrates in this case. + assert(0); + } + } } } else { - float total = 0; - float alloc_ratio[VPX_SS_MAX_LAYERS] = {0}; + if (si->bitrates[0] != 0) { + enc_cfg->rc_target_bitrate = 0; + for (i = 0; i < svc_ctx->spatial_layers; ++i) { + enc_cfg->ss_target_bitrate[i] = (unsigned int)si->bitrates[i]; + enc_cfg->rc_target_bitrate += si->bitrates[i]; + } + } else { + float total = 0; + float alloc_ratio[VPX_MAX_LAYERS] = {0}; - for (i = 0; i < svc_ctx->spatial_layers; ++i) { - if (si->svc_params.scaling_factor_den[i] > 0) { - alloc_ratio[i] = (float)(si->svc_params.scaling_factor_num[i] * 1.0 / - si->svc_params.scaling_factor_den[i]); + for (i = 0; i < svc_ctx->spatial_layers; ++i) { + if (si->svc_params.scaling_factor_den[i] > 0) { + alloc_ratio[i] = (float)(si->svc_params.scaling_factor_num[i] * 1.0 / + si->svc_params.scaling_factor_den[i]); - alloc_ratio[i] *= alloc_ratio[i]; - total += alloc_ratio[i]; + alloc_ratio[i] *= alloc_ratio[i]; + total += alloc_ratio[i]; + } } - } - - for (i = 0; i < VPX_SS_MAX_LAYERS; ++i) { - if (total > 0) { - enc_cfg->ss_target_bitrate[i] = (unsigned int) - (enc_cfg->rc_target_bitrate * alloc_ratio[i] / total); + for (i = 0; i < VPX_SS_MAX_LAYERS; ++i) { + if (total > 0) { + enc_cfg->layer_target_bitrate[i] = (unsigned int) + (enc_cfg->rc_target_bitrate * alloc_ratio[i] / total); + } } } } @@ -365,6 +413,14 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx, return VPX_CODEC_INVALID_PARAM; } + // Note: temporal_layering_mode only applies to one-pass CBR + // si->svc_params.temporal_layering_mode = svc_ctx->temporal_layering_mode; + if (svc_ctx->temporal_layering_mode == 3) { + svc_ctx->temporal_layers = 3; + } else if (svc_ctx->temporal_layering_mode == 2) { + svc_ctx->temporal_layers = 2; + } + for (i = 0; i < VPX_SS_MAX_LAYERS; ++i) { si->svc_params.max_quantizers[i] = MAX_QUANTIZER; si->svc_params.min_quantizers[i] = 0; @@ -387,6 +443,14 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx, if (svc_ctx->temporal_layers > VPX_TS_MAX_LAYERS) svc_ctx->temporal_layers = VPX_TS_MAX_LAYERS; + if (svc_ctx->temporal_layers * svc_ctx->spatial_layers > VPX_MAX_LAYERS) { + svc_log(svc_ctx, SVC_LOG_ERROR, + "spatial layers * temporal layers exceeds the maximum number of " + "allowed layers of %d\n", + 
svc_ctx->spatial_layers * svc_ctx->temporal_layers, + (int) VPX_MAX_LAYERS); + return VPX_CODEC_INVALID_PARAM; + } assign_layer_bitrates(svc_ctx, enc_cfg); #if CONFIG_SPATIAL_SVC @@ -403,10 +467,24 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx, } } - // modify encoder configuration + if (svc_ctx->threads) + enc_cfg->g_threads = svc_ctx->threads; + + // Modify encoder configuration enc_cfg->ss_number_layers = svc_ctx->spatial_layers; enc_cfg->ts_number_layers = svc_ctx->temporal_layers; + if (enc_cfg->rc_end_usage == VPX_CBR) { + enc_cfg->rc_resize_allowed = 0; + enc_cfg->rc_min_quantizer = 2; + enc_cfg->rc_max_quantizer = 63; + enc_cfg->rc_undershoot_pct = 50; + enc_cfg->rc_overshoot_pct = 50; + enc_cfg->rc_buf_initial_sz = 20; + enc_cfg->rc_buf_optimal_sz = 600; + enc_cfg->rc_buf_sz = 1000; + } + if (enc_cfg->g_error_resilient == 0 && si->use_multiple_frame_contexts == 0) enc_cfg->g_error_resilient = 1; @@ -451,6 +529,7 @@ vpx_codec_err_t vpx_svc_encode(SvcContext *svc_ctx, iter = NULL; while ((cx_pkt = vpx_codec_get_cx_data(codec_ctx, &iter))) { switch (cx_pkt->kind) { +#if VPX_ENCODER_ABI_VERSION > (5 + VPX_CODEC_ABI_VERSION) #if CONFIG_SPATIAL_SVC case VPX_CODEC_SPATIAL_SVC_LAYER_PSNR: { int i; @@ -489,6 +568,7 @@ vpx_codec_err_t vpx_svc_encode(SvcContext *svc_ctx, break; } #endif +#endif default: { break; } @@ -554,7 +634,7 @@ const char *vpx_svc_dump_statistics(SvcContext *svc_ctx) { mse[1], mse[2], mse[3]); bytes_total += si->bytes_sum[i]; - // clear sums for next time + // Clear sums for next time. si->bytes_sum[i] = 0; for (j = 0; j < COMPONENTS; ++j) { si->psnr_sum[i][j] = 0; diff --git a/chromium/third_party/libvpx/source/libvpx/vpx/svc_context.h b/chromium/third_party/libvpx/source/libvpx/vpx/svc_context.h index cf791bdeb56..a09651cc991 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx/svc_context.h +++ b/chromium/third_party/libvpx/source/libvpx/vpx/svc_context.h @@ -33,10 +33,13 @@ typedef struct { // public interface to svc_command options int spatial_layers; // number of spatial layers int temporal_layers; // number of temporal layers + int temporal_layering_mode; SVC_LOG_LEVEL log_level; // amount of information to display int log_print; // when set, printf log messages instead of returning the // message with svc_get_message - + int output_rc_stat; // for outputting rc stats + int speed; // speed setting for codec + int threads; // private storage for vpx_svc_encode void *internal; } SvcContext; diff --git a/chromium/third_party/libvpx/source/libvpx/vpx/vp8cx.h b/chromium/third_party/libvpx/source/libvpx/vpx/vp8cx.h index 0e8adc134c5..19bc4bdcce1 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx/vp8cx.h +++ b/chromium/third_party/libvpx/source/libvpx/vpx/vp8cx.h @@ -327,6 +327,8 @@ enum vp8e_enc_control_id { /*!\brief Codec control function to set encoder screen content mode. * + * 0: off, 1: On, 2: On with more aggressive rate control. + * * Supported in codecs: VP8 */ VP8E_SET_SCREEN_CONTENT_MODE, @@ -448,7 +450,6 @@ enum vp8e_enc_control_id { */ VP9E_SET_SVC, -#if VPX_ENCODER_ABI_VERSION > (4 + VPX_CODEC_ABI_VERSION) /*!\brief Codec control function to set parameters for SVC. * \note Parameters contain min_q, max_q, scaling factor for each of the * SVC layers. @@ -456,7 +457,6 @@ enum vp8e_enc_control_id { * Supported in codecs: VP9 */ VP9E_SET_SVC_PARAMETERS, -#endif /*!\brief Codec control function to set svc layer for spatial and temporal. 
* \note Valid ranges: 0..#vpx_codec_enc_cfg::ss_number_layers for spatial @@ -476,7 +476,6 @@ enum vp8e_enc_control_id { */ VP9E_SET_TUNE_CONTENT, -#if VPX_ENCODER_ABI_VERSION > (4 + VPX_CODEC_ABI_VERSION) /*!\brief Codec control function to get svc layer ID. * \note The layer ID returned is for the data packet from the registered * callback function. @@ -492,7 +491,6 @@ enum vp8e_enc_control_id { * Supported in codecs: VP9 */ VP9E_REGISTER_CX_CALLBACK, -#endif /*!\brief Codec control function to set color space info. * \note Valid ranges: 0..7, default is "UNKNOWN". @@ -509,6 +507,17 @@ enum vp8e_enc_control_id { */ VP9E_SET_COLOR_SPACE, + /*!\brief Codec control function to set temporal layering mode. + * \note Valid ranges: 0..3, default is "0" (VP9E_TEMPORAL_LAYERING_MODE_NOLAYERING). + * 0 = VP9E_TEMPORAL_LAYERING_MODE_NOLAYERING + * 1 = VP9E_TEMPORAL_LAYERING_MODE_BYPASS + * 2 = VP9E_TEMPORAL_LAYERING_MODE_0101 + * 3 = VP9E_TEMPORAL_LAYERING_MODE_0212 + * + * Supported in codecs: VP9 + */ + VP9E_SET_TEMPORAL_LAYERING_MODE, + /*!\brief Codec control function to get an Active map back from the encoder. * * Supported in codecs: VP9 @@ -527,6 +536,32 @@ typedef enum vpx_scaling_mode_1d { VP8E_ONETWO = 3 } VPX_SCALING_MODE; +/*!\brief Temporal layering mode enum for VP9 SVC. + * + * This set of macros define the different temporal layering modes. + * Supported codecs: VP9 (in SVC mode) + * + */ +typedef enum vp9e_temporal_layering_mode { + /*!\brief No temporal layering. + * Used when only spatial layering is used. + */ + VP9E_TEMPORAL_LAYERING_MODE_NOLAYERING = 0, + + /*!\brief Bypass mode. + * Used when application needs to control temporal layering. + * This will only work when the number of spatial layers equals 1. + */ + VP9E_TEMPORAL_LAYERING_MODE_BYPASS = 1, + + /*!\brief 0-1-0-1... temporal layering scheme with two temporal layers. + */ + VP9E_TEMPORAL_LAYERING_MODE_0101 = 2, + + /*!\brief 0-2-1-2... temporal layering scheme with three temporal layers. + */ + VP9E_TEMPORAL_LAYERING_MODE_0212 = 3 +} VP9E_TEMPORAL_LAYERING_MODE; /*!\brief vpx region of interest map * @@ -602,7 +637,6 @@ typedef enum { VP8_TUNE_SSIM } vp8e_tuning; -#if VPX_ENCODER_ABI_VERSION > (4 + VPX_CODEC_ABI_VERSION) /*!\brief vp9 svc layer parameters * * This defines the spatial and temporal layer id numbers for svc encoding. @@ -614,18 +648,6 @@ typedef struct vpx_svc_layer_id { int spatial_layer_id; /**< Spatial layer id number. */ int temporal_layer_id; /**< Temporal layer id number. */ } vpx_svc_layer_id_t; -#else -/*!\brief vp9 svc layer parameters - * - * This defines the temporal layer id numbers for svc encoding. - * This is used with the #VP9E_SET_SVC_LAYER_ID control to set the - * temporal layer id for the current frame. - * - */ -typedef struct vpx_svc_layer_id { - int temporal_layer_id; /**< Temporal layer id number. 
*/ -} vpx_svc_layer_id_t; -#endif /*!\brief VP8 encoder control function parameter type * @@ -649,10 +671,8 @@ VPX_CTRL_USE_TYPE(VP8E_SET_ACTIVEMAP, vpx_active_map_t *) VPX_CTRL_USE_TYPE(VP8E_SET_SCALEMODE, vpx_scaling_mode_t *) VPX_CTRL_USE_TYPE(VP9E_SET_SVC, int) -#if VPX_ENCODER_ABI_VERSION > (4 + VPX_CODEC_ABI_VERSION) VPX_CTRL_USE_TYPE(VP9E_SET_SVC_PARAMETERS, void *) VPX_CTRL_USE_TYPE(VP9E_REGISTER_CX_CALLBACK, void *) -#endif VPX_CTRL_USE_TYPE(VP9E_SET_SVC_LAYER_ID, vpx_svc_layer_id_t *) VPX_CTRL_USE_TYPE(VP8E_SET_CPUUSED, int) @@ -673,9 +693,7 @@ VPX_CTRL_USE_TYPE(VP9E_SET_TILE_ROWS, int) VPX_CTRL_USE_TYPE(VP8E_GET_LAST_QUANTIZER, int *) VPX_CTRL_USE_TYPE(VP8E_GET_LAST_QUANTIZER_64, int *) -#if VPX_ENCODER_ABI_VERSION > (4 + VPX_CODEC_ABI_VERSION) VPX_CTRL_USE_TYPE(VP9E_GET_SVC_LAYER_ID, vpx_svc_layer_id_t *) -#endif VPX_CTRL_USE_TYPE(VP8E_SET_MAX_INTRA_BITRATE_PCT, unsigned int) VPX_CTRL_USE_TYPE(VP8E_SET_MAX_INTER_BITRATE_PCT, unsigned int) diff --git a/chromium/third_party/libvpx/source/libvpx/vpx/vp8dx.h b/chromium/third_party/libvpx/source/libvpx/vpx/vp8dx.h index 83898bf8496..bc9cb1a62fc 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx/vp8dx.h +++ b/chromium/third_party/libvpx/source/libvpx/vpx/vp8dx.h @@ -106,6 +106,13 @@ enum vp8_dec_control_id { */ VP9_INVERT_TILE_DECODE_ORDER, + /** control function to set the skip loop filter flag. Valid values are + * integers. The decoder will skip the loop filter when its value is set to + * nonzero. If the loop filter is skipped the decoder may accumulate decode + * artifacts. The default value is 0. + */ + VP9_SET_SKIP_LOOP_FILTER, + VP8_DECODER_CTRL_ID_MAX }; diff --git a/chromium/third_party/libvpx/source/libvpx/vpx/vpx_codec.mk b/chromium/third_party/libvpx/source/libvpx/vpx/vpx_codec.mk index a1ad3c5312c..ccdef040c3a 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx/vpx_codec.mk +++ b/chromium/third_party/libvpx/source/libvpx/vpx/vpx_codec.mk @@ -31,17 +31,17 @@ API_DOC_SRCS-yes += vpx_encoder.h API_DOC_SRCS-yes += vpx_frame_buffer.h API_DOC_SRCS-yes += vpx_image.h -API_SRCS-yes += src/vpx_decoder.c -API_SRCS-yes += vpx_decoder.h -API_SRCS-yes += src/vpx_encoder.c -API_SRCS-yes += vpx_encoder.h -API_SRCS-yes += internal/vpx_codec_internal.h -API_SRCS-yes += internal/vpx_psnr.h -API_SRCS-yes += src/vpx_codec.c -API_SRCS-yes += src/vpx_image.c -API_SRCS-yes += src/vpx_psnr.c -API_SRCS-yes += vpx_codec.h -API_SRCS-yes += vpx_codec.mk -API_SRCS-yes += vpx_frame_buffer.h -API_SRCS-yes += vpx_image.h -API_SRCS-$(BUILD_LIBVPX) += vpx_integer.h +API_SRCS-yes += src/vpx_decoder.c +API_SRCS-yes += vpx_decoder.h +API_SRCS-yes += src/vpx_encoder.c +API_SRCS-yes += vpx_encoder.h +API_SRCS-yes += internal/vpx_codec_internal.h +API_SRCS-yes += internal/vpx_psnr.h +API_SRCS-yes += src/vpx_codec.c +API_SRCS-yes += src/vpx_image.c +API_SRCS-yes += src/vpx_psnr.c +API_SRCS-yes += vpx_codec.h +API_SRCS-yes += vpx_codec.mk +API_SRCS-yes += vpx_frame_buffer.h +API_SRCS-yes += vpx_image.h +API_SRCS-yes += vpx_integer.h diff --git a/chromium/third_party/libvpx/source/libvpx/vpx/vpx_encoder.h b/chromium/third_party/libvpx/source/libvpx/vpx/vpx_encoder.h index bf75584d589..2b17f98a231 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx/vpx_encoder.h +++ b/chromium/third_party/libvpx/source/libvpx/vpx/vpx_encoder.h @@ -42,8 +42,11 @@ extern "C" { /*!\deprecated Use #VPX_TS_MAX_PERIODICITY instead. */ #define MAX_PERIODICITY VPX_TS_MAX_PERIODICITY - /*!\deprecated Use #VPX_TS_MAX_LAYERS instead. 
*/ -#define MAX_LAYERS VPX_TS_MAX_LAYERS +/*! Temporal+Spatial Scalability: Maximum number of coding layers */ +#define VPX_MAX_LAYERS 12 // 3 temporal + 4 spatial layers are allowed. + +/*!\deprecated Use #VPX_MAX_LAYERS instead. */ +#define MAX_LAYERS VPX_MAX_LAYERS // 3 temporal + 4 spatial layers allowed. /*! Spatial Scalability: Maximum number of coding layers */ #define VPX_SS_MAX_LAYERS 5 @@ -59,7 +62,7 @@ extern "C" { * types, removing or reassigning enums, adding/removing/rearranging * fields to structures */ -#define VPX_ENCODER_ABI_VERSION (4 + VPX_CODEC_ABI_VERSION) /**<\hideinitializer*/ +#define VPX_ENCODER_ABI_VERSION (5 + VPX_CODEC_ABI_VERSION) /**<\hideinitializer*/ /*! \brief Encoder capabilities bitfield @@ -163,7 +166,7 @@ extern "C" { VPX_CODEC_PSNR_PKT, /**< PSNR statistics for this frame */ // Spatial SVC is still experimental and may be removed before the next ABI // bump. -#if VPX_ENCODER_ABI_VERSION > (4 + VPX_CODEC_ABI_VERSION) +#if VPX_ENCODER_ABI_VERSION > (5 + VPX_CODEC_ABI_VERSION) VPX_CODEC_SPATIAL_SVC_LAYER_SIZES, /**< Sizes for each layer in this frame*/ VPX_CODEC_SPATIAL_SVC_LAYER_PSNR, /**< PSNR for each layer in this frame*/ #endif @@ -205,7 +208,7 @@ extern "C" { vpx_fixed_buf_t raw; /**< data for arbitrary packets */ // Spatial SVC is still experimental and may be removed before the next // ABI bump. -#if VPX_ENCODER_ABI_VERSION > (4 + VPX_CODEC_ABI_VERSION) +#if VPX_ENCODER_ABI_VERSION > (5 + VPX_CODEC_ABI_VERSION) size_t layer_sizes[VPX_SS_MAX_LAYERS]; struct vpx_psnr_pkt layer_psnr[VPX_SS_MAX_LAYERS]; #endif @@ -729,6 +732,22 @@ extern "C" { * ts_periodicity=8, then ts_layer_id = (0,1,0,1,0,1,0,1). */ unsigned int ts_layer_id[VPX_TS_MAX_PERIODICITY]; + + /*!\brief Target bitrate for each spatial/temporal layer. + * + * These values specify the target coding bitrate to be used for each + * spatial/temporal layer. + * + */ + unsigned int layer_target_bitrate[VPX_MAX_LAYERS]; + + /*!\brief Temporal layering mode indicating which temporal layering scheme to use. + * + * The value (refer to VP9E_TEMPORAL_LAYERING_MODE) specifies the + * temporal layering mode to use. + * + */ + int temporal_layering_mode; } vpx_codec_enc_cfg_t; /**< alias for struct vpx_codec_enc_cfg */ /*!\brief vp9 svc extra configure parameters @@ -737,10 +756,11 @@ extern "C" { * */ typedef struct vpx_svc_parameters { - int max_quantizers[VPX_SS_MAX_LAYERS]; /**< Max Q for each layer */ - int min_quantizers[VPX_SS_MAX_LAYERS]; /**< Min Q for each layer */ - int scaling_factor_num[VPX_SS_MAX_LAYERS]; /**< Scaling factor-numerator*/ - int scaling_factor_den[VPX_SS_MAX_LAYERS]; /**< Scaling factor-denominator*/ + int max_quantizers[VPX_MAX_LAYERS]; /**< Max Q for each layer */ + int min_quantizers[VPX_MAX_LAYERS]; /**< Min Q for each layer */ + int scaling_factor_num[VPX_MAX_LAYERS]; /**< Scaling factor-numerator */ + int scaling_factor_den[VPX_MAX_LAYERS]; /**< Scaling factor-denominator */ + int temporal_layering_mode; /**< Temporal layering mode */ } vpx_svc_extra_cfg_t; diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/variance_media.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/variance_media.asm new file mode 100644 index 00000000000..f7f9e14b0a7 --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/variance_media.asm @@ -0,0 +1,358 @@ +; +; Copyright (c) 2011 The WebM project authors. All Rights Reserved. 
+; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vpx_variance16x16_media| + EXPORT |vpx_variance8x8_media| + EXPORT |vpx_mse16x16_media| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; r0 unsigned char *src_ptr +; r1 int source_stride +; r2 unsigned char *ref_ptr +; r3 int recon_stride +; stack unsigned int *sse +|vpx_variance16x16_media| PROC + + stmfd sp!, {r4-r12, lr} + + pld [r0, r1, lsl #0] + pld [r2, r3, lsl #0] + + mov r8, #0 ; initialize sum = 0 + mov r11, #0 ; initialize sse = 0 + mov r12, #16 ; set loop counter to 16 (=block height) + +loop16x16 + ; 1st 4 pixels + ldr r4, [r0, #0] ; load 4 src pixels + ldr r5, [r2, #0] ; load 4 ref pixels + + mov lr, #0 ; constant zero + + usub8 r6, r4, r5 ; calculate difference + pld [r0, r1, lsl #1] + sel r7, r6, lr ; select bytes with positive difference + usub8 r9, r5, r4 ; calculate difference with reversed operands + pld [r2, r3, lsl #1] + sel r6, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + ; calculate total sum + adds r8, r8, r4 ; add positive differences to sum + subs r8, r8, r5 ; subtract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r10, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + + ; 2nd 4 pixels + ldr r4, [r0, #4] ; load 4 src pixels + ldr r5, [r2, #4] ; load 4 ref pixels + smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) + + usub8 r6, r4, r5 ; calculate difference + sel r7, r6, lr ; select bytes with positive difference + usub8 r9, r5, r4 ; calculate difference with reversed operands + sel r6, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + + ; calculate total sum + add r8, r8, r4 ; add positive differences to sum + sub r8, r8, r5 ; subtract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r10, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + + ; 3rd 4 pixels + ldr r4, [r0, #8] ; load 4 src pixels + ldr r5, [r2, #8] ; load 4 ref pixels + smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) + + usub8 r6, r4, r5 ; calculate difference + sel r7, r6, lr ; select bytes with positive difference + usub8 r9, r5, r4 ; calculate difference with reversed operands + sel r6, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + + ; calculate total sum + add r8, r8, r4 ; add positive differences to sum + sub r8, r8, r5 ; subtract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r10, r6, ror #8 
; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + + ; 4th 4 pixels + ldr r4, [r0, #12] ; load 4 src pixels + ldr r5, [r2, #12] ; load 4 ref pixels + smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) + + usub8 r6, r4, r5 ; calculate difference + add r0, r0, r1 ; set src_ptr to next row + sel r7, r6, lr ; select bytes with positive difference + usub8 r9, r5, r4 ; calculate difference with reversed operands + add r2, r2, r3 ; set dst_ptr to next row + sel r6, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + + ; calculate total sum + add r8, r8, r4 ; add positive differences to sum + sub r8, r8, r5 ; subtract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r10, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) + + + subs r12, r12, #1 + + bne loop16x16 + + ; return stuff + ldr r6, [sp, #40] ; get address of sse + mul r0, r8, r8 ; sum * sum + str r11, [r6] ; store sse + sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8)) + + ldmfd sp!, {r4-r12, pc} + + ENDP + +; r0 unsigned char *src_ptr +; r1 int source_stride +; r2 unsigned char *ref_ptr +; r3 int recon_stride +; stack unsigned int *sse +|vpx_variance8x8_media| PROC + + push {r4-r10, lr} + + pld [r0, r1, lsl #0] + pld [r2, r3, lsl #0] + + mov r12, #8 ; set loop counter to 8 (=block height) + mov r4, #0 ; initialize sum = 0 + mov r5, #0 ; initialize sse = 0 + +loop8x8 + ; 1st 4 pixels + ldr r6, [r0, #0x0] ; load 4 src pixels + ldr r7, [r2, #0x0] ; load 4 ref pixels + + mov lr, #0 ; constant zero + + usub8 r8, r6, r7 ; calculate difference + pld [r0, r1, lsl #1] + sel r10, r8, lr ; select bytes with positive difference + usub8 r9, r7, r6 ; calculate difference with reversed operands + pld [r2, r3, lsl #1] + sel r8, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r6, r10, lr ; calculate sum of positive differences + usad8 r7, r8, lr ; calculate sum of negative differences + orr r8, r8, r10 ; differences of all 4 pixels + ; calculate total sum + add r4, r4, r6 ; add positive differences to sum + sub r4, r4, r7 ; subtract negative differences from sum + + ; calculate sse + uxtb16 r7, r8 ; byte (two pixels) to halfwords + uxtb16 r10, r8, ror #8 ; another two pixels to halfwords + smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1) + + ; 2nd 4 pixels + ldr r6, [r0, #0x4] ; load 4 src pixels + ldr r7, [r2, #0x4] ; load 4 ref pixels + smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2) + + usub8 r8, r6, r7 ; calculate difference + add r0, r0, r1 ; set src_ptr to next row + sel r10, r8, lr ; select bytes with positive difference + usub8 r9, r7, r6 ; calculate difference with reversed operands + add r2, r2, r3 ; set dst_ptr to next row + sel r8, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r6, r10, lr ; calculate sum of positive differences + usad8 r7, r8, lr ; calculate sum of negative differences + orr r8, r8, r10 ; differences of all 4 pixels + + ; calculate total sum + add r4, r4, r6 ; add positive differences to sum + sub r4, r4, r7 ; subtract negative differences from 
sum + + ; calculate sse + uxtb16 r7, r8 ; byte (two pixels) to halfwords + uxtb16 r10, r8, ror #8 ; another two pixels to halfwords + smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1) + subs r12, r12, #1 ; next row + smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2) + + bne loop8x8 + + ; return stuff + ldr r8, [sp, #32] ; get address of sse + mul r1, r4, r4 ; sum * sum + str r5, [r8] ; store sse + sub r0, r5, r1, ASR #6 ; return (sse - ((sum * sum) >> 6)) + + pop {r4-r10, pc} + + ENDP + +; r0 unsigned char *src_ptr +; r1 int source_stride +; r2 unsigned char *ref_ptr +; r3 int recon_stride +; stack unsigned int *sse +; +;note: Based on vpx_variance16x16_media. In this function, sum is never used. +; So, we can remove this part of calculation. + +|vpx_mse16x16_media| PROC + + push {r4-r9, lr} + + pld [r0, r1, lsl #0] + pld [r2, r3, lsl #0] + + mov r12, #16 ; set loop counter to 16 (=block height) + mov r4, #0 ; initialize sse = 0 + +loopmse + ; 1st 4 pixels + ldr r5, [r0, #0x0] ; load 4 src pixels + ldr r6, [r2, #0x0] ; load 4 ref pixels + + mov lr, #0 ; constant zero + + usub8 r8, r5, r6 ; calculate difference + pld [r0, r1, lsl #1] + sel r7, r8, lr ; select bytes with positive difference + usub8 r9, r6, r5 ; calculate difference with reversed operands + pld [r2, r3, lsl #1] + sel r8, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r5, r7, lr ; calculate sum of positive differences + usad8 r6, r8, lr ; calculate sum of negative differences + orr r8, r8, r7 ; differences of all 4 pixels + + ldr r5, [r0, #0x4] ; load 4 src pixels + + ; calculate sse + uxtb16 r6, r8 ; byte (two pixels) to halfwords + uxtb16 r7, r8, ror #8 ; another two pixels to halfwords + smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1) + + ; 2nd 4 pixels + ldr r6, [r2, #0x4] ; load 4 ref pixels + smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2) + + usub8 r8, r5, r6 ; calculate difference + sel r7, r8, lr ; select bytes with positive difference + usub8 r9, r6, r5 ; calculate difference with reversed operands + sel r8, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r5, r7, lr ; calculate sum of positive differences + usad8 r6, r8, lr ; calculate sum of negative differences + orr r8, r8, r7 ; differences of all 4 pixels + ldr r5, [r0, #0x8] ; load 4 src pixels + ; calculate sse + uxtb16 r6, r8 ; byte (two pixels) to halfwords + uxtb16 r7, r8, ror #8 ; another two pixels to halfwords + smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1) + + ; 3rd 4 pixels + ldr r6, [r2, #0x8] ; load 4 ref pixels + smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2) + + usub8 r8, r5, r6 ; calculate difference + sel r7, r8, lr ; select bytes with positive difference + usub8 r9, r6, r5 ; calculate difference with reversed operands + sel r8, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r5, r7, lr ; calculate sum of positive differences + usad8 r6, r8, lr ; calculate sum of negative differences + orr r8, r8, r7 ; differences of all 4 pixels + + ldr r5, [r0, #0xc] ; load 4 src pixels + + ; calculate sse + uxtb16 r6, r8 ; byte (two pixels) to halfwords + uxtb16 r7, r8, ror #8 ; another two pixels to halfwords + smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1) + + ; 4th 4 pixels + ldr r6, [r2, #0xc] ; load 4 ref pixels + smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2) + + usub8 r8, r5, r6 ; 
calculate difference + add r0, r0, r1 ; set src_ptr to next row + sel r7, r8, lr ; select bytes with positive difference + usub8 r9, r6, r5 ; calculate difference with reversed operands + add r2, r2, r3 ; set dst_ptr to next row + sel r8, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r5, r7, lr ; calculate sum of positive differences + usad8 r6, r8, lr ; calculate sum of negative differences + orr r8, r8, r7 ; differences of all 4 pixels + + subs r12, r12, #1 ; next row + + ; calculate sse + uxtb16 r6, r8 ; byte (two pixels) to halfwords + uxtb16 r7, r8, ror #8 ; another two pixels to halfwords + smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1) + smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2) + + bne loopmse + + ; return stuff + ldr r1, [sp, #28] ; get address of sse + mov r0, r4 ; return sse + str r4, [r1] ; store sse + + pop {r4-r9, pc} + + ENDP + + END diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/variance_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/variance_neon.c new file mode 100644 index 00000000000..ede6e7bbb03 --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/variance_neon.c @@ -0,0 +1,418 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "./vpx_dsp_rtcd.h" +#include "./vpx_config.h" + +#include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" + +static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) { + const int32x4_t a = vpaddlq_s16(v_16x8); + const int64x2_t b = vpaddlq_s32(a); + const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), + vreinterpret_s32_s64(vget_high_s64(b))); + return vget_lane_s32(c, 0); +} + +static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) { + const int64x2_t b = vpaddlq_s32(v_32x4); + const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), + vreinterpret_s32_s64(vget_high_s64(b))); + return vget_lane_s32(c, 0); +} + +// w * h must be less than 2048 or local variable v_sum may overflow. 
+static void variance_neon_w8(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + int w, int h, uint32_t *sse, int *sum) { + int i, j; + int16x8_t v_sum = vdupq_n_s16(0); + int32x4_t v_sse_lo = vdupq_n_s32(0); + int32x4_t v_sse_hi = vdupq_n_s32(0); + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + const uint8x8_t v_a = vld1_u8(&a[j]); + const uint8x8_t v_b = vld1_u8(&b[j]); + const uint16x8_t v_diff = vsubl_u8(v_a, v_b); + const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff); + v_sum = vaddq_s16(v_sum, sv_diff); + v_sse_lo = vmlal_s16(v_sse_lo, + vget_low_s16(sv_diff), + vget_low_s16(sv_diff)); + v_sse_hi = vmlal_s16(v_sse_hi, + vget_high_s16(sv_diff), + vget_high_s16(sv_diff)); + } + a += a_stride; + b += b_stride; + } + + *sum = horizontal_add_s16x8(v_sum); + *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi)); +} + +void vpx_get8x8var_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse, int *sum) { + variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, sum); +} + +void vpx_get16x16var_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse, int *sum) { + variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, sum); +} + +unsigned int vpx_variance8x8_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum; + variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &sum); + return *sse - (((int64_t)sum * sum) >> 6); // >> 6 = / 8 * 8 +} + +unsigned int vpx_variance16x16_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum; + variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, &sum); + return *sse - (((int64_t)sum * sum) >> 8); // >> 8 = / 16 * 16 +} + +unsigned int vpx_variance32x32_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum; + variance_neon_w8(a, a_stride, b, b_stride, 32, 32, sse, &sum); + return *sse - (((int64_t)sum * sum) >> 10); // >> 10 = / 32 * 32 +} + +unsigned int vpx_variance32x64_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum1, sum2; + uint32_t sse1, sse2; + variance_neon_w8(a, a_stride, b, b_stride, 32, 32, &sse1, &sum1); + variance_neon_w8(a + (32 * a_stride), a_stride, + b + (32 * b_stride), b_stride, 32, 32, + &sse2, &sum2); + *sse = sse1 + sse2; + sum1 += sum2; + return *sse - (((int64_t)sum1 * sum1) >> 11); // >> 11 = / 32 * 64 +} + +unsigned int vpx_variance64x32_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum1, sum2; + uint32_t sse1, sse2; + variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1); + variance_neon_w8(a + (16 * a_stride), a_stride, + b + (16 * b_stride), b_stride, 64, 16, + &sse2, &sum2); + *sse = sse1 + sse2; + sum1 += sum2; + return *sse - (((int64_t)sum1 * sum1) >> 11); // >> 11 = / 32 * 64 +} + +unsigned int vpx_variance64x64_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum1, sum2; + uint32_t sse1, sse2; + + variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1); + variance_neon_w8(a + (16 * a_stride), a_stride, + b + (16 * b_stride), b_stride, 64, 16, + &sse2, &sum2); + sse1 += sse2; + sum1 += sum2; + + variance_neon_w8(a + (16 * 2 * a_stride), a_stride, + b + (16 * 2 * b_stride), b_stride, + 64, 16, &sse2, &sum2); + sse1 += sse2; + sum1 += sum2; + + variance_neon_w8(a + (16 * 3 * 
a_stride), a_stride, + b + (16 * 3 * b_stride), b_stride, + 64, 16, &sse2, &sum2); + *sse = sse1 + sse2; + sum1 += sum2; + return *sse - (((int64_t)sum1 * sum1) >> 12); // >> 12 = / 64 * 64 +} + +unsigned int vpx_variance16x8_neon( + const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride, + unsigned int *sse) { + int i; + int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; + uint32x2_t d0u32, d10u32; + int64x1_t d0s64, d1s64; + uint8x16_t q0u8, q1u8, q2u8, q3u8; + uint16x8_t q11u16, q12u16, q13u16, q14u16; + int32x4_t q8s32, q9s32, q10s32; + int64x2_t q0s64, q1s64, q5s64; + + q8s32 = vdupq_n_s32(0); + q9s32 = vdupq_n_s32(0); + q10s32 = vdupq_n_s32(0); + + for (i = 0; i < 4; i++) { + q0u8 = vld1q_u8(src_ptr); + src_ptr += source_stride; + q1u8 = vld1q_u8(src_ptr); + src_ptr += source_stride; + __builtin_prefetch(src_ptr); + + q2u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + q3u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + __builtin_prefetch(ref_ptr); + + q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8)); + q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8)); + q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8)); + q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8)); + + d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); + d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); + q9s32 = vmlal_s16(q9s32, d22s16, d22s16); + q10s32 = vmlal_s16(q10s32, d23s16, d23s16); + + d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); + d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); + q9s32 = vmlal_s16(q9s32, d24s16, d24s16); + q10s32 = vmlal_s16(q10s32, d25s16, d25s16); + + d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); + d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16)); + q9s32 = vmlal_s16(q9s32, d26s16, d26s16); + q10s32 = vmlal_s16(q10s32, d27s16, d27s16); + + d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16)); + d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16)); + q9s32 = vmlal_s16(q9s32, d28s16, d28s16); + q10s32 = vmlal_s16(q10s32, d29s16, d29s16); + } + + q10s32 = vaddq_s32(q10s32, q9s32); + q0s64 = vpaddlq_s32(q8s32); + q1s64 = vpaddlq_s32(q10s32); + + d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64)); + d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); + + q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), + vreinterpret_s32_s64(d0s64)); + vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); + + d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7); + d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); + + return vget_lane_u32(d0u32, 0); +} + +unsigned int vpx_variance8x16_neon( + const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride, + unsigned int *sse) { + int i; + uint8x8_t d0u8, d2u8, d4u8, d6u8; + int16x4_t d22s16, d23s16, d24s16, d25s16; + uint32x2_t d0u32, d10u32; + int64x1_t d0s64, d1s64; + uint16x8_t q11u16, q12u16; + int32x4_t q8s32, q9s32, q10s32; + int64x2_t q0s64, q1s64, q5s64; + + q8s32 = vdupq_n_s32(0); + q9s32 = vdupq_n_s32(0); + q10s32 = vdupq_n_s32(0); + + for (i = 0; i < 8; i++) { + d0u8 = vld1_u8(src_ptr); + src_ptr += source_stride; + d2u8 = vld1_u8(src_ptr); + src_ptr += source_stride; + __builtin_prefetch(src_ptr); + + 
d4u8 = vld1_u8(ref_ptr); + ref_ptr += recon_stride; + d6u8 = vld1_u8(ref_ptr); + ref_ptr += recon_stride; + __builtin_prefetch(ref_ptr); + + q11u16 = vsubl_u8(d0u8, d4u8); + q12u16 = vsubl_u8(d2u8, d6u8); + + d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); + d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); + q9s32 = vmlal_s16(q9s32, d22s16, d22s16); + q10s32 = vmlal_s16(q10s32, d23s16, d23s16); + + d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); + d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); + q9s32 = vmlal_s16(q9s32, d24s16, d24s16); + q10s32 = vmlal_s16(q10s32, d25s16, d25s16); + } + + q10s32 = vaddq_s32(q10s32, q9s32); + q0s64 = vpaddlq_s32(q8s32); + q1s64 = vpaddlq_s32(q10s32); + + d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64)); + d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); + + q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), + vreinterpret_s32_s64(d0s64)); + vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); + + d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7); + d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); + + return vget_lane_u32(d0u32, 0); +} + +unsigned int vpx_mse16x16_neon( + const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride, + unsigned int *sse) { + int i; + int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; + int64x1_t d0s64; + uint8x16_t q0u8, q1u8, q2u8, q3u8; + int32x4_t q7s32, q8s32, q9s32, q10s32; + uint16x8_t q11u16, q12u16, q13u16, q14u16; + int64x2_t q1s64; + + q7s32 = vdupq_n_s32(0); + q8s32 = vdupq_n_s32(0); + q9s32 = vdupq_n_s32(0); + q10s32 = vdupq_n_s32(0); + + for (i = 0; i < 8; i++) { // mse16x16_neon_loop + q0u8 = vld1q_u8(src_ptr); + src_ptr += source_stride; + q1u8 = vld1q_u8(src_ptr); + src_ptr += source_stride; + q2u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + q3u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + + q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8)); + q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8)); + q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8)); + q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8)); + + d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); + d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); + q7s32 = vmlal_s16(q7s32, d22s16, d22s16); + q8s32 = vmlal_s16(q8s32, d23s16, d23s16); + + d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); + d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); + q9s32 = vmlal_s16(q9s32, d24s16, d24s16); + q10s32 = vmlal_s16(q10s32, d25s16, d25s16); + + d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); + d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); + q7s32 = vmlal_s16(q7s32, d26s16, d26s16); + q8s32 = vmlal_s16(q8s32, d27s16, d27s16); + + d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16)); + d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16)); + q9s32 = vmlal_s16(q9s32, d28s16, d28s16); + q10s32 = vmlal_s16(q10s32, d29s16, d29s16); + } + + q7s32 = vaddq_s32(q7s32, q8s32); + q9s32 = vaddq_s32(q9s32, q10s32); + q10s32 = vaddq_s32(q7s32, q9s32); + + q1s64 = vpaddlq_s32(q10s32); + d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); + + vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d0s64), 0); + return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0); +} + +unsigned int vpx_get4x4sse_cs_neon( + const unsigned char *src_ptr, + int source_stride, + 
const unsigned char *ref_ptr, + int recon_stride) { + int16x4_t d22s16, d24s16, d26s16, d28s16; + int64x1_t d0s64; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; + int32x4_t q7s32, q8s32, q9s32, q10s32; + uint16x8_t q11u16, q12u16, q13u16, q14u16; + int64x2_t q1s64; + + d0u8 = vld1_u8(src_ptr); + src_ptr += source_stride; + d4u8 = vld1_u8(ref_ptr); + ref_ptr += recon_stride; + d1u8 = vld1_u8(src_ptr); + src_ptr += source_stride; + d5u8 = vld1_u8(ref_ptr); + ref_ptr += recon_stride; + d2u8 = vld1_u8(src_ptr); + src_ptr += source_stride; + d6u8 = vld1_u8(ref_ptr); + ref_ptr += recon_stride; + d3u8 = vld1_u8(src_ptr); + src_ptr += source_stride; + d7u8 = vld1_u8(ref_ptr); + ref_ptr += recon_stride; + + q11u16 = vsubl_u8(d0u8, d4u8); + q12u16 = vsubl_u8(d1u8, d5u8); + q13u16 = vsubl_u8(d2u8, d6u8); + q14u16 = vsubl_u8(d3u8, d7u8); + + d22s16 = vget_low_s16(vreinterpretq_s16_u16(q11u16)); + d24s16 = vget_low_s16(vreinterpretq_s16_u16(q12u16)); + d26s16 = vget_low_s16(vreinterpretq_s16_u16(q13u16)); + d28s16 = vget_low_s16(vreinterpretq_s16_u16(q14u16)); + + q7s32 = vmull_s16(d22s16, d22s16); + q8s32 = vmull_s16(d24s16, d24s16); + q9s32 = vmull_s16(d26s16, d26s16); + q10s32 = vmull_s16(d28s16, d28s16); + + q7s32 = vaddq_s32(q7s32, q8s32); + q9s32 = vaddq_s32(q9s32, q10s32); + q9s32 = vaddq_s32(q7s32, q9s32); + + q1s64 = vpaddlq_s32(q9s32); + d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); + + return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0); +} diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/sad.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/sad.c index 9db312fbe05..c0c3ff99645 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/sad.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/sad.c @@ -14,13 +14,7 @@ #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" - -#if CONFIG_VP9_HIGHBITDEPTH -#include "vp9/common/vp9_common.h" -#endif // CONFIG_VP9_HIGHBITDEPTH -// Temporary ... -#define ROUND_POWER_OF_TWO(value, n) \ - (((value) + (1 << ((n) - 1))) >> (n)) +#include "vpx_ports/mem.h" /* Sum the difference between every corresponding element of the buffers. */ static INLINE unsigned int sad(const uint8_t *a, int a_stride, @@ -39,6 +33,7 @@ static INLINE unsigned int sad(const uint8_t *a, int a_stride, return sad; } +// TODO(johannkoenig): this moved to vpx_dsp, should be able to clean this up. /* Remove dependency on vp9 variance function by duplicating vp9_comp_avg_pred. * The function averages every corresponding element of the buffers and stores * the value in a third buffer, comp_pred. diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/variance.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/variance.c new file mode 100644 index 00000000000..084dd7b7ead --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/variance.c @@ -0,0 +1,306 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx_ports/mem.h" +#include "vpx/vpx_integer.h" + +unsigned int vpx_get4x4sse_cs_c(const unsigned char *a, int a_stride, + const unsigned char *b, int b_stride) { + int distortion = 0; + int r, c; + + for (r = 0; r < 4; r++) { + for (c = 0; c < 4; c++) { + int diff = a[c] - b[c]; + distortion += diff * diff; + } + + a += a_stride; + b += b_stride; + } + + return distortion; +} + +unsigned int vpx_get_mb_ss_c(const int16_t *a) { + unsigned int i, sum = 0; + + for (i = 0; i < 256; ++i) { + sum += a[i] * a[i]; + } + + return sum; +} + +static void variance(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + int w, int h, unsigned int *sse, int *sum) { + int i, j; + + *sum = 0; + *sse = 0; + + for (i = 0; i < h; i++) { + for (j = 0; j < w; j++) { + const int diff = a[j] - b[j]; + *sum += diff; + *sse += diff * diff; + } + + a += a_stride; + b += b_stride; + } +} + +#define VAR(W, H) \ +unsigned int vpx_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + unsigned int *sse) { \ + int sum; \ + variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ + return *sse - (((int64_t)sum * sum) / (W * H)); \ +} + +/* Identical to the variance call except it takes an additional parameter, sum, + * and returns that value using pass-by-reference instead of returning + * sse - sum^2 / w*h + */ +#define GET_VAR(W, H) \ +void vpx_get##W##x##H##var_c(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + unsigned int *sse, int *sum) { \ + variance(a, a_stride, b, b_stride, W, H, sse, sum); \ +} + +/* Identical to the variance call except it does not calculate the + * sse - sum^2 / w*h and returns sse in addtion to modifying the passed in + * variable. 
+ */ +#define MSE(W, H) \ +unsigned int vpx_mse##W##x##H##_c(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + unsigned int *sse) { \ + int sum; \ + variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ + return *sse; \ +} + +VAR(64, 64) +VAR(64, 32) +VAR(32, 64) +VAR(32, 32) +VAR(32, 16) +VAR(16, 32) +VAR(16, 16) +VAR(16, 8) +VAR(8, 16) +VAR(8, 8) +VAR(8, 4) +VAR(4, 8) +VAR(4, 4) + +GET_VAR(16, 16) +GET_VAR(8, 8) + +MSE(16, 16) +MSE(16, 8) +MSE(8, 16) +MSE(8, 8) + +void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, + int height, const uint8_t *ref, int ref_stride) { + int i, j; + + for (i = 0; i < height; i++) { + for (j = 0; j < width; j++) { + const int tmp = pred[j] + ref[j]; + comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1); + } + comp_pred += width; + pred += width; + ref += ref_stride; + } +} + +#if CONFIG_VP9_HIGHBITDEPTH +static void highbd_variance64(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + int w, int h, uint64_t *sse, uint64_t *sum) { + int i, j; + + uint16_t *a = CONVERT_TO_SHORTPTR(a8); + uint16_t *b = CONVERT_TO_SHORTPTR(b8); + *sum = 0; + *sse = 0; + + for (i = 0; i < h; i++) { + for (j = 0; j < w; j++) { + const int diff = a[j] - b[j]; + *sum += diff; + *sse += diff * diff; + } + a += a_stride; + b += b_stride; + } +} + +static void highbd_8_variance(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + int w, int h, unsigned int *sse, int *sum) { + uint64_t sse_long = 0; + uint64_t sum_long = 0; + highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long); + *sse = (unsigned int)sse_long; + *sum = (int)sum_long; +} + +static void highbd_10_variance(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + int w, int h, unsigned int *sse, int *sum) { + uint64_t sse_long = 0; + uint64_t sum_long = 0; + highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long); + *sse = (unsigned int)ROUND_POWER_OF_TWO(sse_long, 4); + *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2); +} + +static void highbd_12_variance(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + int w, int h, unsigned int *sse, int *sum) { + uint64_t sse_long = 0; + uint64_t sum_long = 0; + highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long); + *sse = (unsigned int)ROUND_POWER_OF_TWO(sse_long, 8); + *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); +} + +#define HIGHBD_VAR(W, H) \ +unsigned int vpx_highbd_8_variance##W##x##H##_c(const uint8_t *a, \ + int a_stride, \ + const uint8_t *b, \ + int b_stride, \ + unsigned int *sse) { \ + int sum; \ + highbd_8_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ + return *sse - (((int64_t)sum * sum) / (W * H)); \ +} \ +\ +unsigned int vpx_highbd_10_variance##W##x##H##_c(const uint8_t *a, \ + int a_stride, \ + const uint8_t *b, \ + int b_stride, \ + unsigned int *sse) { \ + int sum; \ + highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ + return *sse - (((int64_t)sum * sum) / (W * H)); \ +} \ +\ +unsigned int vpx_highbd_12_variance##W##x##H##_c(const uint8_t *a, \ + int a_stride, \ + const uint8_t *b, \ + int b_stride, \ + unsigned int *sse) { \ + int sum; \ + highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ + return *sse - (((int64_t)sum * sum) / (W * H)); \ +} + +#define HIGHBD_GET_VAR(S) \ +void vpx_highbd_8_get##S##x##S##var_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + unsigned int *sse, int *sum) { \ + highbd_8_variance(src, src_stride, ref, 
ref_stride, S, S, sse, sum); \ +} \ +\ +void vpx_highbd_10_get##S##x##S##var_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + unsigned int *sse, int *sum) { \ + highbd_10_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \ +} \ +\ +void vpx_highbd_12_get##S##x##S##var_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + unsigned int *sse, int *sum) { \ + highbd_12_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \ +} + +#define HIGHBD_MSE(W, H) \ +unsigned int vpx_highbd_8_mse##W##x##H##_c(const uint8_t *src, \ + int src_stride, \ + const uint8_t *ref, \ + int ref_stride, \ + unsigned int *sse) { \ + int sum; \ + highbd_8_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \ + return *sse; \ +} \ +\ +unsigned int vpx_highbd_10_mse##W##x##H##_c(const uint8_t *src, \ + int src_stride, \ + const uint8_t *ref, \ + int ref_stride, \ + unsigned int *sse) { \ + int sum; \ + highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \ + return *sse; \ +} \ +\ +unsigned int vpx_highbd_12_mse##W##x##H##_c(const uint8_t *src, \ + int src_stride, \ + const uint8_t *ref, \ + int ref_stride, \ + unsigned int *sse) { \ + int sum; \ + highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \ + return *sse; \ +} + +HIGHBD_GET_VAR(8) +HIGHBD_GET_VAR(16) + +HIGHBD_MSE(16, 16) +HIGHBD_MSE(16, 8) +HIGHBD_MSE(8, 16) +HIGHBD_MSE(8, 8) + +HIGHBD_VAR(64, 64) +HIGHBD_VAR(64, 32) +HIGHBD_VAR(32, 64) +HIGHBD_VAR(32, 32) +HIGHBD_VAR(32, 16) +HIGHBD_VAR(16, 32) +HIGHBD_VAR(16, 16) +HIGHBD_VAR(16, 8) +HIGHBD_VAR(8, 16) +HIGHBD_VAR(8, 8) +HIGHBD_VAR(8, 4) +HIGHBD_VAR(4, 8) +HIGHBD_VAR(4, 4) + +void vpx_highbd_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred8, + int width, int height, const uint8_t *ref8, + int ref_stride) { + int i, j; + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + for (i = 0; i < height; i++) { + for (j = 0; j < width; j++) { + const int tmp = pred[j] + ref[j]; + comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1); + } + comp_pred += width; + pred += width; + ref += ref_stride; + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp.mk b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp.mk index 606515d2c19..f23534adc15 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp.mk +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp.mk @@ -17,6 +17,7 @@ DSP_SRCS-$(HAVE_MEDIA) += arm/sad_media$(ASM) DSP_SRCS-$(HAVE_NEON) += arm/sad4d_neon.c DSP_SRCS-$(HAVE_NEON) += arm/sad_neon.c + DSP_SRCS-$(HAVE_MMX) += x86/sad_mmx.asm DSP_SRCS-$(HAVE_SSE2) += x86/sad4d_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/sad_sse2.asm @@ -29,9 +30,28 @@ DSP_SRCS-$(HAVE_AVX2) += x86/sad_avx2.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm + endif # CONFIG_VP9_HIGHBITDEPTH endif # CONFIG_ENCODERS +ifneq ($(filter yes,$(CONFIG_ENCODERS) $(CONFIG_POSTPROC) $(CONFIG_VP9_POSTPROC)),) +DSP_SRCS-yes += variance.c + +DSP_SRCS-$(HAVE_MEDIA) += arm/variance_media$(ASM) +DSP_SRCS-$(HAVE_NEON) += arm/variance_neon.c + +DSP_SRCS-$(HAVE_MMX) += x86/variance_mmx.c +DSP_SRCS-$(HAVE_MMX) += x86/variance_impl_mmx.asm +DSP_SRCS-$(HAVE_SSE2) += x86/variance_sse2.c +DSP_SRCS-$(HAVE_AVX2) += x86/variance_avx2.c +DSP_SRCS-$(HAVE_AVX2) += x86/variance_impl_avx2.c + +ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) 
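The CONFIG_VP9_HIGHBITDEPTH sources added under this conditional pair with the highbd_10_variance / highbd_12_variance helpers in the new variance.c above: 10-bit pixels make each diff up to four times larger than in the 8-bit path, so the accumulated sum grows by 2^2 and the SSE by 2^4 (2^4 and 2^8 for 12-bit), and the helpers shift those factors back out so the returned statistics stay on the familiar 8-bit scale. The rounding macro they use is the one the patch now takes from vpx_ports/mem.h; its definition is visible in the temporary copy deleted from sad.c earlier in this diff. A small sketch of the 10-bit case, with a hypothetical helper name:

```c
#include <stdint.h>

/* Round-to-nearest right shift, as defined in the copy removed from sad.c. */
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

/* Hypothetical helper mirroring highbd_10_variance(): rescale the wide
 * accumulators so the caller sees 8-bit-scale sum and SSE. */
static void normalize_10bit_stats(int64_t sum_long, uint64_t sse_long,
                                  int *sum, unsigned int *sse) {
  *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);          /* divide by 4  */
  *sse = (unsigned int)ROUND_POWER_OF_TWO(sse_long, 4); /* divide by 16 */
}
```

The 12-bit path does the same with shifts of 4 and 8, and the SSE2 wrappers in highbd_variance_sse2.c below repeat the pattern after summing the per-block results returned by the assembly kernels.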
+DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_sse2.c +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_impl_sse2.asm +endif # CONFIG_VP9_HIGHBITDEPTH +endif # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC + DSP_SRCS-no += $(DSP_SRCS_REMOVE-yes) DSP_SRCS-yes += vpx_dsp_rtcd.c diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl index ebec9ec0660..55271cf9c14 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -392,4 +392,212 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { } # CONFIG_VP9_HIGHBITDEPTH } # CONFIG_ENCODERS +if (vpx_config("CONFIG_ENCODERS") eq "yes" || vpx_config("CONFIG_POSTPROC") eq "yes" || vpx_config("CONFIG_VP9_POSTPROC") eq "yes") { + +add_proto qw/unsigned int vpx_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_variance64x64 sse2 avx2 neon/; + +add_proto qw/unsigned int vpx_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_variance64x32 sse2 avx2 neon/; + +add_proto qw/unsigned int vpx_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_variance32x64 sse2 neon/; + +add_proto qw/unsigned int vpx_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_variance32x32 sse2 avx2 neon/; + +add_proto qw/unsigned int vpx_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_variance32x16 sse2 avx2/; + +add_proto qw/unsigned int vpx_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_variance16x32 sse2/; + +add_proto qw/unsigned int vpx_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_variance16x16 mmx sse2 avx2 media neon/; + +add_proto qw/unsigned int vpx_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_variance16x8 mmx sse2 neon/; + +add_proto qw/unsigned int vpx_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_variance8x16 mmx sse2 neon/; + +add_proto qw/unsigned int vpx_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_variance8x8 mmx sse2 media neon/; + +add_proto qw/unsigned int vpx_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_variance8x4 sse2/; + +add_proto qw/unsigned int vpx_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_variance4x8 sse2/; + +add_proto qw/unsigned int vpx_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_variance4x4 mmx sse2/; + + +add_proto qw/void vpx_get16x16var/, "const uint8_t 
*src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + specialize qw/vpx_get16x16var sse2 avx2 neon/; + +add_proto qw/void vpx_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + specialize qw/vpx_get8x8var mmx sse2 neon/; + +add_proto qw/unsigned int vpx_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + specialize qw/vpx_mse16x16 mmx sse2 avx2 media neon/; + +add_proto qw/unsigned int vpx_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + specialize qw/vpx_mse16x8 sse2/; + +add_proto qw/unsigned int vpx_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + specialize qw/vpx_mse8x16 sse2/; + +add_proto qw/unsigned int vpx_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + specialize qw/vpx_mse8x8 sse2/; + +add_proto qw/unsigned int vpx_get_mb_ss/, "const int16_t *"; + specialize qw/vpx_get_mb_ss mmx sse2/; + +add_proto qw/unsigned int vpx_get4x4sse_cs/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride"; + specialize qw/vpx_get4x4sse_cs neon/; + +add_proto qw/void vpx_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride"; + +if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { + add_proto qw/unsigned int vpx_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_12_variance64x64 sse2/; + + add_proto qw/unsigned int vpx_highbd_12_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_12_variance64x32 sse2/; + + add_proto qw/unsigned int vpx_highbd_12_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_12_variance32x64 sse2/; + + add_proto qw/unsigned int vpx_highbd_12_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_12_variance32x32 sse2/; + + add_proto qw/unsigned int vpx_highbd_12_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_12_variance32x16 sse2/; + + add_proto qw/unsigned int vpx_highbd_12_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_12_variance16x32 sse2/; + + add_proto qw/unsigned int vpx_highbd_12_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_12_variance16x16 sse2/; + + add_proto qw/unsigned int vpx_highbd_12_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_12_variance16x8 sse2/; + + add_proto qw/unsigned int vpx_highbd_12_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_12_variance8x16 sse2/; + + add_proto qw/unsigned int 
vpx_highbd_12_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_12_variance8x8 sse2/; + + add_proto qw/unsigned int vpx_highbd_12_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_12_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_12_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + + add_proto qw/unsigned int vpx_highbd_10_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_10_variance64x64 sse2/; + + add_proto qw/unsigned int vpx_highbd_10_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_10_variance64x32 sse2/; + + add_proto qw/unsigned int vpx_highbd_10_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_10_variance32x64 sse2/; + + add_proto qw/unsigned int vpx_highbd_10_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_10_variance32x32 sse2/; + + add_proto qw/unsigned int vpx_highbd_10_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_10_variance32x16 sse2/; + + add_proto qw/unsigned int vpx_highbd_10_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_10_variance16x32 sse2/; + + add_proto qw/unsigned int vpx_highbd_10_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_10_variance16x16 sse2/; + + add_proto qw/unsigned int vpx_highbd_10_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_10_variance16x8 sse2/; + + add_proto qw/unsigned int vpx_highbd_10_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_10_variance8x16 sse2/; + + add_proto qw/unsigned int vpx_highbd_10_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_10_variance8x8 sse2/; + + add_proto qw/unsigned int vpx_highbd_10_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_10_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_10_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + + add_proto qw/unsigned int vpx_highbd_8_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_8_variance64x64 sse2/; + + add_proto qw/unsigned int 
vpx_highbd_8_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_8_variance64x32 sse2/; + + add_proto qw/unsigned int vpx_highbd_8_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_8_variance32x64 sse2/; + + add_proto qw/unsigned int vpx_highbd_8_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_8_variance32x32 sse2/; + + add_proto qw/unsigned int vpx_highbd_8_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_8_variance32x16 sse2/; + + add_proto qw/unsigned int vpx_highbd_8_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_8_variance16x32 sse2/; + + add_proto qw/unsigned int vpx_highbd_8_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_8_variance16x16 sse2/; + + add_proto qw/unsigned int vpx_highbd_8_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_8_variance16x8 sse2/; + + add_proto qw/unsigned int vpx_highbd_8_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_8_variance8x16 sse2/; + + add_proto qw/unsigned int vpx_highbd_8_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_8_variance8x8 sse2/; + + add_proto qw/unsigned int vpx_highbd_8_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_8_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_8_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + + add_proto qw/void vpx_highbd_8_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + add_proto qw/void vpx_highbd_8_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + + add_proto qw/void vpx_highbd_10_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + add_proto qw/void vpx_highbd_10_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + + add_proto qw/void vpx_highbd_12_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + add_proto qw/void vpx_highbd_12_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + + add_proto qw/unsigned int vpx_highbd_8_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + specialize qw/vpx_highbd_8_mse16x16 sse2/; + + add_proto 
qw/unsigned int vpx_highbd_8_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_8_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_8_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + specialize qw/vpx_highbd_8_mse8x8 sse2/; + + add_proto qw/unsigned int vpx_highbd_10_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + specialize qw/vpx_highbd_10_mse16x16 sse2/; + + add_proto qw/unsigned int vpx_highbd_10_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_10_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_10_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + specialize qw/vpx_highbd_10_mse8x8 sse2/; + + add_proto qw/unsigned int vpx_highbd_12_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + specialize qw/vpx_highbd_12_mse16x16 sse2/; + + add_proto qw/unsigned int vpx_highbd_12_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_12_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_12_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + specialize qw/vpx_highbd_12_mse8x8 sse2/; + + add_proto qw/void vpx_highbd_comp_avg_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride"; +} # CONFIG_VP9_HIGHBITDEPTH +} # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC + 1; diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_highbd_variance_impl_sse2.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm index 821dd0660bc..923418a9921 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_highbd_variance_impl_sse2.asm +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm @@ -11,7 +11,7 @@ %include "vpx_ports/x86_abi_support.asm" -;unsigned int vp9_highbd_calc16x16var_sse2 +;unsigned int vpx_highbd_calc16x16var_sse2 ;( ; unsigned char * src_ptr, ; int source_stride, @@ -20,8 +20,8 @@ ; unsigned int * SSE, ; int * Sum ;) -global sym(vp9_highbd_calc16x16var_sse2) PRIVATE -sym(vp9_highbd_calc16x16var_sse2): +global sym(vpx_highbd_calc16x16var_sse2) PRIVATE +sym(vpx_highbd_calc16x16var_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 @@ -164,7 +164,7 @@ sym(vp9_highbd_calc16x16var_sse2): ret -;unsigned int vp9_highbd_calc8x8var_sse2 +;unsigned int vpx_highbd_calc8x8var_sse2 ;( ; unsigned char * src_ptr, ; int source_stride, @@ -173,8 +173,8 @@ sym(vp9_highbd_calc16x16var_sse2): ; unsigned int * SSE, ; int * Sum ;) -global sym(vp9_highbd_calc8x8var_sse2) PRIVATE -sym(vp9_highbd_calc8x8var_sse2): +global sym(vpx_highbd_calc8x8var_sse2) PRIVATE +sym(vpx_highbd_calc8x8var_sse2): push rbp mov 
rbp, rsp SHADOW_ARGS_TO_STACK 6 diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_variance_sse2.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_variance_sse2.c new file mode 100644 index 00000000000..343c0478b9a --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_variance_sse2.c @@ -0,0 +1,245 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include "./vpx_config.h" +#include "vp9/common/vp9_common.h" + +#include "vp9/encoder/vp9_variance.h" +#include "vpx_ports/mem.h" + +typedef uint32_t (*high_variance_fn_t) (const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + uint32_t *sse, int *sum); + +uint32_t vpx_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + uint32_t *sse, int *sum); + +uint32_t vpx_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + uint32_t *sse, int *sum); + +static void highbd_8_variance_sse2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int w, int h, uint32_t *sse, int *sum, + high_variance_fn_t var_fn, int block_size) { + int i, j; + + *sse = 0; + *sum = 0; + + for (i = 0; i < h; i += block_size) { + for (j = 0; j < w; j += block_size) { + unsigned int sse0; + int sum0; + var_fn(src + src_stride * i + j, src_stride, + ref + ref_stride * i + j, ref_stride, &sse0, &sum0); + *sse += sse0; + *sum += sum0; + } + } +} + +static void highbd_10_variance_sse2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int w, int h, uint32_t *sse, int *sum, + high_variance_fn_t var_fn, int block_size) { + int i, j; + uint64_t sse_long = 0; + int64_t sum_long = 0; + + for (i = 0; i < h; i += block_size) { + for (j = 0; j < w; j += block_size) { + unsigned int sse0; + int sum0; + var_fn(src + src_stride * i + j, src_stride, + ref + ref_stride * i + j, ref_stride, &sse0, &sum0); + sse_long += sse0; + sum_long += sum0; + } + } + *sum = ROUND_POWER_OF_TWO(sum_long, 2); + *sse = ROUND_POWER_OF_TWO(sse_long, 4); +} + +static void highbd_12_variance_sse2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int w, int h, uint32_t *sse, int *sum, + high_variance_fn_t var_fn, int block_size) { + int i, j; + uint64_t sse_long = 0; + int64_t sum_long = 0; + + for (i = 0; i < h; i += block_size) { + for (j = 0; j < w; j += block_size) { + unsigned int sse0; + int sum0; + var_fn(src + src_stride * i + j, src_stride, + ref + ref_stride * i + j, ref_stride, &sse0, &sum0); + sse_long += sse0; + sum_long += sum0; + } + } + *sum = ROUND_POWER_OF_TWO(sum_long, 4); + *sse = ROUND_POWER_OF_TWO(sse_long, 8); +} + + +#define HIGH_GET_VAR(S) \ +void vpx_highbd_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \ + const uint8_t *ref8, int ref_stride, \ + uint32_t *sse, int *sum) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \ + sse, sum); \ +} \ +\ +void vpx_highbd_10_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \ + const uint8_t 
*ref8, int ref_stride, \ + uint32_t *sse, int *sum) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \ + sse, sum); \ + *sum = ROUND_POWER_OF_TWO(*sum, 2); \ + *sse = ROUND_POWER_OF_TWO(*sse, 4); \ +} \ +\ +void vpx_highbd_12_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \ + const uint8_t *ref8, int ref_stride, \ + uint32_t *sse, int *sum) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \ + sse, sum); \ + *sum = ROUND_POWER_OF_TWO(*sum, 4); \ + *sse = ROUND_POWER_OF_TWO(*sse, 8); \ +} + +HIGH_GET_VAR(16); +HIGH_GET_VAR(8); + +#undef HIGH_GET_VAR + +#define VAR_FN(w, h, block_size, shift) \ +uint32_t vpx_highbd_8_variance##w##x##h##_sse2( \ + const uint8_t *src8, int src_stride, \ + const uint8_t *ref8, int ref_stride, uint32_t *sse) { \ + int sum; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + highbd_8_variance_sse2(src, src_stride, ref, ref_stride, w, h, sse, &sum, \ + vpx_highbd_calc##block_size##x##block_size##var_sse2, \ + block_size); \ + return *sse - (((int64_t)sum * sum) >> shift); \ +} \ +\ +uint32_t vpx_highbd_10_variance##w##x##h##_sse2( \ + const uint8_t *src8, int src_stride, \ + const uint8_t *ref8, int ref_stride, uint32_t *sse) { \ + int sum; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + highbd_10_variance_sse2( \ + src, src_stride, ref, ref_stride, w, h, sse, &sum, \ + vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \ + return *sse - (((int64_t)sum * sum) >> shift); \ +} \ +\ +uint32_t vpx_highbd_12_variance##w##x##h##_sse2( \ + const uint8_t *src8, int src_stride, \ + const uint8_t *ref8, int ref_stride, uint32_t *sse) { \ + int sum; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + highbd_12_variance_sse2( \ + src, src_stride, ref, ref_stride, w, h, sse, &sum, \ + vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \ + return *sse - (((int64_t)sum * sum) >> shift); \ +} + +VAR_FN(64, 64, 16, 12); +VAR_FN(64, 32, 16, 11); +VAR_FN(32, 64, 16, 11); +VAR_FN(32, 32, 16, 10); +VAR_FN(32, 16, 16, 9); +VAR_FN(16, 32, 16, 9); +VAR_FN(16, 16, 16, 8); +VAR_FN(16, 8, 8, 7); +VAR_FN(8, 16, 8, 7); +VAR_FN(8, 8, 8, 6); + +#undef VAR_FN + +unsigned int vpx_highbd_8_mse16x16_sse2(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, + unsigned int *sse) { + int sum; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, + sse, &sum, vpx_highbd_calc16x16var_sse2, 16); + return *sse; +} + +unsigned int vpx_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, + unsigned int *sse) { + int sum; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, + sse, &sum, vpx_highbd_calc16x16var_sse2, 16); + return *sse; +} + +unsigned int vpx_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, + unsigned int *sse) { + int sum; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + highbd_12_variance_sse2(src, 
src_stride, ref, ref_stride, 16, 16, + sse, &sum, vpx_highbd_calc16x16var_sse2, 16); + return *sse; +} + +unsigned int vpx_highbd_8_mse8x8_sse2(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, + unsigned int *sse) { + int sum; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, + sse, &sum, vpx_highbd_calc8x8var_sse2, 8); + return *sse; +} + +unsigned int vpx_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, + unsigned int *sse) { + int sum; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, + sse, &sum, vpx_highbd_calc8x8var_sse2, 8); + return *sse; +} + +unsigned int vpx_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, + unsigned int *sse) { + int sum; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, + sse, &sum, vpx_highbd_calc8x8var_sse2, 8); + return *sse; +} diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/sad4d_avx2.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/sad4d_avx2.c index 4128f2ac37c..793658f9ea9 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/sad4d_avx2.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/sad4d_avx2.c @@ -8,18 +8,19 @@ * be found in the AUTHORS file in the root of the source tree. */ #include <immintrin.h> // AVX2 +#include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" -void vpx_sad32x32x4d_avx2(uint8_t *src, +void vpx_sad32x32x4d_avx2(const uint8_t *src, int src_stride, - uint8_t *ref[4], + const uint8_t *const ref[4], int ref_stride, uint32_t res[4]) { __m256i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg; __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3; __m256i sum_mlow, sum_mhigh; int i; - uint8_t *ref0, *ref1, *ref2, *ref3; + const uint8_t *ref0, *ref1, *ref2, *ref3; ref0 = ref[0]; ref1 = ref[1]; @@ -31,11 +32,11 @@ void vpx_sad32x32x4d_avx2(uint8_t *src, sum_ref3 = _mm256_set1_epi16(0); for (i = 0; i < 32 ; i++) { // load src and all refs - src_reg = _mm256_loadu_si256((__m256i *)(src)); - ref0_reg = _mm256_loadu_si256((__m256i *) (ref0)); - ref1_reg = _mm256_loadu_si256((__m256i *) (ref1)); - ref2_reg = _mm256_loadu_si256((__m256i *) (ref2)); - ref3_reg = _mm256_loadu_si256((__m256i *) (ref3)); + src_reg = _mm256_loadu_si256((const __m256i *)src); + ref0_reg = _mm256_loadu_si256((const __m256i *)ref0); + ref1_reg = _mm256_loadu_si256((const __m256i *)ref1); + ref2_reg = _mm256_loadu_si256((const __m256i *)ref2); + ref3_reg = _mm256_loadu_si256((const __m256i *)ref3); // sum of the absolute differences between every ref-i to src ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg); ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg); @@ -80,9 +81,9 @@ void vpx_sad32x32x4d_avx2(uint8_t *src, } } -void vpx_sad64x64x4d_avx2(uint8_t *src, +void vpx_sad64x64x4d_avx2(const uint8_t *src, int src_stride, - uint8_t *ref[4], + const uint8_t *const ref[4], int ref_stride, uint32_t res[4]) { __m256i src_reg, srcnext_reg, ref0_reg, ref0next_reg; @@ -91,7 +92,7 @@ void vpx_sad64x64x4d_avx2(uint8_t *src, __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3; __m256i sum_mlow, sum_mhigh; int i; - uint8_t *ref0, *ref1, *ref2, *ref3; + const uint8_t *ref0, *ref1, *ref2, *ref3; ref0 = ref[0]; ref1 = 
ref[1]; @@ -103,16 +104,16 @@ void vpx_sad64x64x4d_avx2(uint8_t *src, sum_ref3 = _mm256_set1_epi16(0); for (i = 0; i < 64 ; i++) { // load 64 bytes from src and all refs - src_reg = _mm256_loadu_si256((__m256i *)(src)); - srcnext_reg = _mm256_loadu_si256((__m256i *)(src + 32)); - ref0_reg = _mm256_loadu_si256((__m256i *) (ref0)); - ref0next_reg = _mm256_loadu_si256((__m256i *) (ref0 + 32)); - ref1_reg = _mm256_loadu_si256((__m256i *) (ref1)); - ref1next_reg = _mm256_loadu_si256((__m256i *) (ref1 + 32)); - ref2_reg = _mm256_loadu_si256((__m256i *) (ref2)); - ref2next_reg = _mm256_loadu_si256((__m256i *) (ref2 + 32)); - ref3_reg = _mm256_loadu_si256((__m256i *) (ref3)); - ref3next_reg = _mm256_loadu_si256((__m256i *) (ref3 + 32)); + src_reg = _mm256_loadu_si256((const __m256i *)src); + srcnext_reg = _mm256_loadu_si256((const __m256i *)(src + 32)); + ref0_reg = _mm256_loadu_si256((const __m256i *)ref0); + ref0next_reg = _mm256_loadu_si256((const __m256i *)(ref0 + 32)); + ref1_reg = _mm256_loadu_si256((const __m256i *)ref1); + ref1next_reg = _mm256_loadu_si256((const __m256i *)(ref1 + 32)); + ref2_reg = _mm256_loadu_si256((const __m256i *)ref2); + ref2next_reg = _mm256_loadu_si256((const __m256i *)(ref2 + 32)); + ref3_reg = _mm256_loadu_si256((const __m256i *)ref3); + ref3next_reg = _mm256_loadu_si256((const __m256i *)(ref3 + 32)); // sum of the absolute differences between every ref-i to src ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg); ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg); diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/sad_avx2.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/sad_avx2.c index 78536a47218..ce9ad8f780c 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/sad_avx2.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/sad_avx2.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ #include <immintrin.h> +#include "./vpx_dsp_rtcd.h" #include "vpx_ports/mem.h" #define FSAD64_H(h) \ diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/variance_avx2.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/variance_avx2.c new file mode 100644 index 00000000000..82cef4af0af --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/variance_avx2.c @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ +#include "./vpx_dsp_rtcd.h" + +typedef void (*get_var_avx2)(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse, int *sum); + +void vpx_get32x32var_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse, int *sum); + +static void variance_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int w, int h, unsigned int *sse, int *sum, + get_var_avx2 var_fn, int block_size) { + int i, j; + + *sse = 0; + *sum = 0; + + for (i = 0; i < h; i += 16) { + for (j = 0; j < w; j += block_size) { + unsigned int sse0; + int sum0; + var_fn(&src[src_stride * i + j], src_stride, + &ref[ref_stride * i + j], ref_stride, &sse0, &sum0); + *sse += sse0; + *sum += sum0; + } + } +} + + +unsigned int vpx_variance16x16_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_avx2(src, src_stride, ref, ref_stride, 16, 16, + sse, &sum, vpx_get16x16var_avx2, 16); + return *sse - (((unsigned int)sum * sum) >> 8); +} + +unsigned int vpx_mse16x16_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + vpx_get16x16var_avx2(src, src_stride, ref, ref_stride, sse, &sum); + return *sse; +} + +unsigned int vpx_variance32x16_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_avx2(src, src_stride, ref, ref_stride, 32, 16, + sse, &sum, vpx_get32x32var_avx2, 32); + return *sse - (((int64_t)sum * sum) >> 9); +} + +unsigned int vpx_variance32x32_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_avx2(src, src_stride, ref, ref_stride, 32, 32, + sse, &sum, vpx_get32x32var_avx2, 32); + return *sse - (((int64_t)sum * sum) >> 10); +} + +unsigned int vpx_variance64x64_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_avx2(src, src_stride, ref, ref_stride, 64, 64, + sse, &sum, vpx_get32x32var_avx2, 32); + return *sse - (((int64_t)sum * sum) >> 12); +} + +unsigned int vpx_variance64x32_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_avx2(src, src_stride, ref, ref_stride, 64, 32, + sse, &sum, vpx_get32x32var_avx2, 32); + return *sse - (((int64_t)sum * sum) >> 11); +} diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_variance_impl_intrin_avx2.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/variance_impl_avx2.c index f9923280a34..0e40959aa9d 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_variance_impl_intrin_avx2.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/variance_impl_avx2.c @@ -10,7 +10,9 @@ #include <immintrin.h> // AVX2 -void vp9_get16x16var_avx2(const unsigned char *src_ptr, +#include "./vpx_dsp_rtcd.h" + +void vpx_get16x16var_avx2(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int recon_stride, @@ -121,7 +123,7 @@ void vp9_get16x16var_avx2(const unsigned char *src_ptr, } } -void vp9_get32x32var_avx2(const unsigned char *src_ptr, +void vpx_get32x32var_avx2(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int recon_stride, diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/variance_impl_mmx.asm 
b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/variance_impl_mmx.asm index 7d5e6810bf0..a8d7d99dbc0 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/variance_impl_mmx.asm +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/variance_impl_mmx.asm @@ -11,9 +11,9 @@ %include "vpx_ports/x86_abi_support.asm" -;unsigned int vp8_get_mb_ss_mmx( short *src_ptr ) -global sym(vp8_get_mb_ss_mmx) PRIVATE -sym(vp8_get_mb_ss_mmx): +;unsigned int vpx_get_mb_ss_mmx( short *src_ptr ) +global sym(vpx_get_mb_ss_mmx) PRIVATE +sym(vpx_get_mb_ss_mmx): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 @@ -63,7 +63,7 @@ sym(vp8_get_mb_ss_mmx): ret -;unsigned int vp8_get8x8var_mmx +;void vpx_get8x8var_mmx ;( ; unsigned char *src_ptr, ; int source_stride, @@ -72,8 +72,8 @@ sym(vp8_get_mb_ss_mmx): ; unsigned int *SSE, ; int *Sum ;) -global sym(vp8_get8x8var_mmx) PRIVATE -sym(vp8_get8x8var_mmx): +global sym(vpx_get8x8var_mmx) PRIVATE +sym(vpx_get8x8var_mmx): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 @@ -310,8 +310,8 @@ sym(vp8_get8x8var_mmx): -;unsigned int -;vp8_get4x4var_mmx +;void +;vpx_get4x4var_mmx ;( ; unsigned char *src_ptr, ; int source_stride, @@ -320,8 +320,8 @@ sym(vp8_get8x8var_mmx): ; unsigned int *SSE, ; int *Sum ;) -global sym(vp8_get4x4var_mmx) PRIVATE -sym(vp8_get4x4var_mmx): +global sym(vpx_get4x4var_mmx) PRIVATE +sym(vpx_get4x4var_mmx): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 @@ -422,430 +422,3 @@ sym(vp8_get4x4var_mmx): UNSHADOW_ARGS pop rbp ret - - - -;unsigned int -;vp8_get4x4sse_cs_mmx -;( -; unsigned char *src_ptr, -; int source_stride, -; unsigned char *ref_ptr, -; int recon_stride -;) -global sym(vp8_get4x4sse_cs_mmx) PRIVATE -sym(vp8_get4x4sse_cs_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - push rsi - push rdi - push rbx - ; end prolog - - - pxor mm6, mm6 ; Blank mmx7 - pxor mm7, mm7 ; Blank mmx7 - - mov rax, arg(0) ;[src_ptr] ; Load base addresses - mov rbx, arg(2) ;[ref_ptr] - movsxd rcx, dword ptr arg(1) ;[source_stride] - movsxd rdx, dword ptr arg(3) ;[recon_stride] - ; Row 1 - movd mm0, [rax] ; Copy eight bytes to mm0 - movd mm1, [rbx] ; Copy eight bytes to mm1 - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - pmaddwd mm0, mm0 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movd mm1, [rbx] ; Copy eight bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - - ; Row 2 - movd mm0, [rax] ; Copy eight bytes to mm0 - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - pmaddwd mm0, mm0 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movd mm1, [rbx] ; Copy eight bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - - ; Row 3 - movd mm0, [rax] ; Copy eight bytes to mm0 - punpcklbw mm1, mm6 - punpcklbw mm0, mm6 ; unpack to higher prrcision - psubsw mm0, mm1 ; A-B (low order) to MM0 - - pmaddwd mm0, mm0 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movd mm1, [rbx] ; Copy eight bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - - ; Row 4 - movd mm0, [rax] ; Copy eight bytes to mm0 - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - pmaddwd mm0, mm0 ; square and accumulate - paddd mm7, mm0 ; accumulate in mm7 - - movq mm0, mm7 ; - psrlq mm7, 32 - - paddd mm0, mm7 - 
movq rax, mm0 - - - ; begin epilog - pop rbx - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -%define mmx_filter_shift 7 - -;void vp8_filter_block2d_bil4x4_var_mmx -;( -; unsigned char *ref_ptr, -; int ref_pixels_per_line, -; unsigned char *src_ptr, -; int src_pixels_per_line, -; unsigned short *HFilter, -; unsigned short *VFilter, -; int *sum, -; unsigned int *sumsquared -;) -global sym(vp8_filter_block2d_bil4x4_var_mmx) PRIVATE -sym(vp8_filter_block2d_bil4x4_var_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 8 - GET_GOT rbx - push rsi - push rdi - sub rsp, 16 - ; end prolog - - - pxor mm6, mm6 ; - pxor mm7, mm7 ; - - mov rax, arg(4) ;HFilter ; - mov rdx, arg(5) ;VFilter ; - - mov rsi, arg(0) ;ref_ptr ; - mov rdi, arg(2) ;src_ptr ; - - mov rcx, 4 ; - pxor mm0, mm0 ; - - movd mm1, [rsi] ; - movd mm3, [rsi+1] ; - - punpcklbw mm1, mm0 ; - pmullw mm1, [rax] ; - - punpcklbw mm3, mm0 ; - pmullw mm3, [rax+8] ; - - paddw mm1, mm3 ; - paddw mm1, [GLOBAL(mmx_bi_rd)] ; - - psraw mm1, mmx_filter_shift ; - movq mm5, mm1 - -%if ABI_IS_32BIT - add rsi, dword ptr arg(1) ;ref_pixels_per_line ; -%else - movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ; - add rsi, r8 -%endif - -.filter_block2d_bil4x4_var_mmx_loop: - - movd mm1, [rsi] ; - movd mm3, [rsi+1] ; - - punpcklbw mm1, mm0 ; - pmullw mm1, [rax] ; - - punpcklbw mm3, mm0 ; - pmullw mm3, [rax+8] ; - - paddw mm1, mm3 ; - paddw mm1, [GLOBAL(mmx_bi_rd)] ; - - psraw mm1, mmx_filter_shift ; - movq mm3, mm5 ; - - movq mm5, mm1 ; - pmullw mm3, [rdx] ; - - pmullw mm1, [rdx+8] ; - paddw mm1, mm3 ; - - - paddw mm1, [GLOBAL(mmx_bi_rd)] ; - psraw mm1, mmx_filter_shift ; - - movd mm3, [rdi] ; - punpcklbw mm3, mm0 ; - - psubw mm1, mm3 ; - paddw mm6, mm1 ; - - pmaddwd mm1, mm1 ; - paddd mm7, mm1 ; - -%if ABI_IS_32BIT - add rsi, dword ptr arg(1) ;ref_pixels_per_line ; - add rdi, dword ptr arg(3) ;src_pixels_per_line ; -%else - movsxd r8, dword ptr arg(1) ;ref_pixels_per_line - movsxd r9, dword ptr arg(3) ;src_pixels_per_line - add rsi, r8 - add rdi, r9 -%endif - sub rcx, 1 ; - jnz .filter_block2d_bil4x4_var_mmx_loop ; - - - pxor mm3, mm3 ; - pxor mm2, mm2 ; - - punpcklwd mm2, mm6 ; - punpckhwd mm3, mm6 ; - - paddd mm2, mm3 ; - movq mm6, mm2 ; - - psrlq mm6, 32 ; - paddd mm2, mm6 ; - - psrad mm2, 16 ; - movq mm4, mm7 ; - - psrlq mm4, 32 ; - paddd mm4, mm7 ; - - mov rdi, arg(6) ;sum - mov rsi, arg(7) ;sumsquared - - movd dword ptr [rdi], mm2 ; - movd dword ptr [rsi], mm4 ; - - - - ; begin epilog - add rsp, 16 - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - - - -;void vp8_filter_block2d_bil_var_mmx -;( -; unsigned char *ref_ptr, -; int ref_pixels_per_line, -; unsigned char *src_ptr, -; int src_pixels_per_line, -; unsigned int Height, -; unsigned short *HFilter, -; unsigned short *VFilter, -; int *sum, -; unsigned int *sumsquared -;) -global sym(vp8_filter_block2d_bil_var_mmx) PRIVATE -sym(vp8_filter_block2d_bil_var_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 9 - GET_GOT rbx - push rsi - push rdi - sub rsp, 16 - ; end prolog - - pxor mm6, mm6 ; - pxor mm7, mm7 ; - mov rax, arg(5) ;HFilter ; - - mov rdx, arg(6) ;VFilter ; - mov rsi, arg(0) ;ref_ptr ; - - mov rdi, arg(2) ;src_ptr ; - movsxd rcx, dword ptr arg(4) ;Height ; - - pxor mm0, mm0 ; - movq mm1, [rsi] ; - - movq mm3, [rsi+1] ; - movq mm2, mm1 ; - - movq mm4, mm3 ; - punpcklbw mm1, mm0 ; - - punpckhbw mm2, mm0 ; - pmullw mm1, [rax] ; - - pmullw mm2, [rax] ; - punpcklbw mm3, mm0 ; - - punpckhbw mm4, mm0 ; - pmullw mm3, [rax+8] ; - - pmullw mm4, [rax+8] ; - paddw mm1, mm3 
; - - paddw mm2, mm4 ; - paddw mm1, [GLOBAL(mmx_bi_rd)] ; - - psraw mm1, mmx_filter_shift ; - paddw mm2, [GLOBAL(mmx_bi_rd)] ; - - psraw mm2, mmx_filter_shift ; - movq mm5, mm1 - - packuswb mm5, mm2 ; -%if ABI_IS_32BIT - add rsi, dword ptr arg(1) ;ref_pixels_per_line -%else - movsxd r8, dword ptr arg(1) ;ref_pixels_per_line - add rsi, r8 -%endif - -.filter_block2d_bil_var_mmx_loop: - - movq mm1, [rsi] ; - movq mm3, [rsi+1] ; - - movq mm2, mm1 ; - movq mm4, mm3 ; - - punpcklbw mm1, mm0 ; - punpckhbw mm2, mm0 ; - - pmullw mm1, [rax] ; - pmullw mm2, [rax] ; - - punpcklbw mm3, mm0 ; - punpckhbw mm4, mm0 ; - - pmullw mm3, [rax+8] ; - pmullw mm4, [rax+8] ; - - paddw mm1, mm3 ; - paddw mm2, mm4 ; - - paddw mm1, [GLOBAL(mmx_bi_rd)] ; - psraw mm1, mmx_filter_shift ; - - paddw mm2, [GLOBAL(mmx_bi_rd)] ; - psraw mm2, mmx_filter_shift ; - - movq mm3, mm5 ; - movq mm4, mm5 ; - - punpcklbw mm3, mm0 ; - punpckhbw mm4, mm0 ; - - movq mm5, mm1 ; - packuswb mm5, mm2 ; - - pmullw mm3, [rdx] ; - pmullw mm4, [rdx] ; - - pmullw mm1, [rdx+8] ; - pmullw mm2, [rdx+8] ; - - paddw mm1, mm3 ; - paddw mm2, mm4 ; - - paddw mm1, [GLOBAL(mmx_bi_rd)] ; - paddw mm2, [GLOBAL(mmx_bi_rd)] ; - - psraw mm1, mmx_filter_shift ; - psraw mm2, mmx_filter_shift ; - - movq mm3, [rdi] ; - movq mm4, mm3 ; - - punpcklbw mm3, mm0 ; - punpckhbw mm4, mm0 ; - - psubw mm1, mm3 ; - psubw mm2, mm4 ; - - paddw mm6, mm1 ; - pmaddwd mm1, mm1 ; - - paddw mm6, mm2 ; - pmaddwd mm2, mm2 ; - - paddd mm7, mm1 ; - paddd mm7, mm2 ; - -%if ABI_IS_32BIT - add rsi, dword ptr arg(1) ;ref_pixels_per_line ; - add rdi, dword ptr arg(3) ;src_pixels_per_line ; -%else - movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ; - movsxd r9, dword ptr arg(3) ;src_pixels_per_line ; - add rsi, r8 - add rdi, r9 -%endif - sub rcx, 1 ; - jnz .filter_block2d_bil_var_mmx_loop ; - - - pxor mm3, mm3 ; - pxor mm2, mm2 ; - - punpcklwd mm2, mm6 ; - punpckhwd mm3, mm6 ; - - paddd mm2, mm3 ; - movq mm6, mm2 ; - - psrlq mm6, 32 ; - paddd mm2, mm6 ; - - psrad mm2, 16 ; - movq mm4, mm7 ; - - psrlq mm4, 32 ; - paddd mm4, mm7 ; - - mov rdi, arg(7) ;sum - mov rsi, arg(8) ;sumsquared - - movd dword ptr [rdi], mm2 ; - movd dword ptr [rsi], mm4 ; - - ; begin epilog - add rsp, 16 - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -SECTION_RODATA -;short mmx_bi_rd[4] = { 64, 64, 64, 64}; -align 16 -mmx_bi_rd: - times 4 dw 64 diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/variance_mmx.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/variance_mmx.c new file mode 100644 index 00000000000..99dd741bca5 --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/variance_mmx.c @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "./vpx_dsp_rtcd.h" + +extern void vpx_get4x4var_mmx(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse, int *sum); + +unsigned int vpx_variance4x4_mmx(const unsigned char *a, int a_stride, + const unsigned char *b, int b_stride, + unsigned int *sse) { + unsigned int var; + int avg; + + vpx_get4x4var_mmx(a, a_stride, b, b_stride, &var, &avg); + *sse = var; + return (var - (((unsigned int)avg * avg) >> 4)); +} + +unsigned int vpx_variance8x8_mmx(const unsigned char *a, int a_stride, + const unsigned char *b, int b_stride, + unsigned int *sse) { + unsigned int var; + int avg; + + vpx_get8x8var_mmx(a, a_stride, b, b_stride, &var, &avg); + *sse = var; + + return (var - (((unsigned int)avg * avg) >> 6)); +} + +unsigned int vpx_mse16x16_mmx(const unsigned char *a, int a_stride, + const unsigned char *b, int b_stride, + unsigned int *sse) { + unsigned int sse0, sse1, sse2, sse3, var; + int sum0, sum1, sum2, sum3; + + vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0); + vpx_get8x8var_mmx(a + 8, a_stride, b + 8, b_stride, &sse1, &sum1); + vpx_get8x8var_mmx(a + 8 * a_stride, a_stride, + b + 8 * b_stride, b_stride, &sse2, &sum2); + vpx_get8x8var_mmx(a + 8 * a_stride + 8, a_stride, + b + 8 * b_stride + 8, b_stride, &sse3, &sum3); + + var = sse0 + sse1 + sse2 + sse3; + *sse = var; + return var; +} + +unsigned int vpx_variance16x16_mmx(const unsigned char *a, int a_stride, + const unsigned char *b, int b_stride, + unsigned int *sse) { + unsigned int sse0, sse1, sse2, sse3, var; + int sum0, sum1, sum2, sum3, avg; + + vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0); + vpx_get8x8var_mmx(a + 8, a_stride, b + 8, b_stride, &sse1, &sum1); + vpx_get8x8var_mmx(a + 8 * a_stride, a_stride, + b + 8 * b_stride, b_stride, &sse2, &sum2); + vpx_get8x8var_mmx(a + 8 * a_stride + 8, a_stride, + b + 8 * b_stride + 8, b_stride, &sse3, &sum3); + + var = sse0 + sse1 + sse2 + sse3; + avg = sum0 + sum1 + sum2 + sum3; + *sse = var; + return (var - (((unsigned int)avg * avg) >> 8)); +} + +unsigned int vpx_variance16x8_mmx(const unsigned char *a, int a_stride, + const unsigned char *b, int b_stride, + unsigned int *sse) { + unsigned int sse0, sse1, var; + int sum0, sum1, avg; + + vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0); + vpx_get8x8var_mmx(a + 8, a_stride, b + 8, b_stride, &sse1, &sum1); + + var = sse0 + sse1; + avg = sum0 + sum1; + *sse = var; + return (var - (((unsigned int)avg * avg) >> 7)); +} + +unsigned int vpx_variance8x16_mmx(const unsigned char *a, int a_stride, + const unsigned char *b, int b_stride, + unsigned int *sse) { + unsigned int sse0, sse1, var; + int sum0, sum1, avg; + + vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0); + vpx_get8x8var_mmx(a + 8 * a_stride, a_stride, + b + 8 * b_stride, b_stride, &sse1, &sum1); + + var = sse0 + sse1; + avg = sum0 + sum1; + *sse = var; + + return (var - (((unsigned int)avg * avg) >> 7)); +} diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/variance_sse2.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/variance_sse2.c new file mode 100644 index 00000000000..6256bc53621 --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/variance_sse2.c @@ -0,0 +1,309 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. 
An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <emmintrin.h> // SSE2 + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx_ports/mem.h" + +typedef void (*getNxMvar_fn_t) (const unsigned char *src, int src_stride, + const unsigned char *ref, int ref_stride, + unsigned int *sse, int *sum); + +unsigned int vpx_get_mb_ss_sse2(const int16_t *src) { + __m128i vsum = _mm_setzero_si128(); + int i; + + for (i = 0; i < 32; ++i) { + const __m128i v = _mm_loadu_si128((const __m128i *)src); + vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v)); + src += 8; + } + + vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8)); + vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4)); + return _mm_cvtsi128_si32(vsum); +} + +#define READ64(p, stride, i) \ + _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const uint32_t *)(p + i * stride)), \ + _mm_cvtsi32_si128(*(const uint32_t *)(p + (i + 1) * stride))) + +static void get4x4var_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse, int *sum) { + const __m128i zero = _mm_setzero_si128(); + const __m128i src0 = _mm_unpacklo_epi8(READ64(src, src_stride, 0), zero); + const __m128i src1 = _mm_unpacklo_epi8(READ64(src, src_stride, 2), zero); + const __m128i ref0 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 0), zero); + const __m128i ref1 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 2), zero); + const __m128i diff0 = _mm_sub_epi16(src0, ref0); + const __m128i diff1 = _mm_sub_epi16(src1, ref1); + + // sum + __m128i vsum = _mm_add_epi16(diff0, diff1); + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2)); + *sum = (int16_t)_mm_extract_epi16(vsum, 0); + + // sse + vsum = _mm_add_epi32(_mm_madd_epi16(diff0, diff0), + _mm_madd_epi16(diff1, diff1)); + vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8)); + vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4)); + *sse = _mm_cvtsi128_si32(vsum); +} + +void vpx_get8x8var_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse, int *sum) { + const __m128i zero = _mm_setzero_si128(); + __m128i vsum = _mm_setzero_si128(); + __m128i vsse = _mm_setzero_si128(); + int i; + + for (i = 0; i < 8; i += 2) { + const __m128i src0 = _mm_unpacklo_epi8(_mm_loadl_epi64( + (const __m128i *)(src + i * src_stride)), zero); + const __m128i ref0 = _mm_unpacklo_epi8(_mm_loadl_epi64( + (const __m128i *)(ref + i * ref_stride)), zero); + const __m128i diff0 = _mm_sub_epi16(src0, ref0); + + const __m128i src1 = _mm_unpacklo_epi8(_mm_loadl_epi64( + (const __m128i *)(src + (i + 1) * src_stride)), zero); + const __m128i ref1 = _mm_unpacklo_epi8(_mm_loadl_epi64( + (const __m128i *)(ref + (i + 1) * ref_stride)), zero); + const __m128i diff1 = _mm_sub_epi16(src1, ref1); + + vsum = _mm_add_epi16(vsum, diff0); + vsum = _mm_add_epi16(vsum, diff1); + vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0)); + vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1)); + } + + // sum + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2)); + *sum = (int16_t)_mm_extract_epi16(vsum, 0); + + // sse + vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8)); + vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4)); + *sse = 
_mm_cvtsi128_si32(vsse); +} + +void vpx_get16x16var_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse, int *sum) { + const __m128i zero = _mm_setzero_si128(); + __m128i vsum = _mm_setzero_si128(); + __m128i vsse = _mm_setzero_si128(); + int i; + + for (i = 0; i < 16; ++i) { + const __m128i s = _mm_loadu_si128((const __m128i *)src); + const __m128i r = _mm_loadu_si128((const __m128i *)ref); + + const __m128i src0 = _mm_unpacklo_epi8(s, zero); + const __m128i ref0 = _mm_unpacklo_epi8(r, zero); + const __m128i diff0 = _mm_sub_epi16(src0, ref0); + + const __m128i src1 = _mm_unpackhi_epi8(s, zero); + const __m128i ref1 = _mm_unpackhi_epi8(r, zero); + const __m128i diff1 = _mm_sub_epi16(src1, ref1); + + vsum = _mm_add_epi16(vsum, diff0); + vsum = _mm_add_epi16(vsum, diff1); + vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0)); + vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1)); + + src += src_stride; + ref += ref_stride; + } + + // sum + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); + *sum = (int16_t)_mm_extract_epi16(vsum, 0) + + (int16_t)_mm_extract_epi16(vsum, 1); + + // sse + vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8)); + vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4)); + *sse = _mm_cvtsi128_si32(vsse); +} + + +static void variance_sse2(const unsigned char *src, int src_stride, + const unsigned char *ref, int ref_stride, + int w, int h, unsigned int *sse, int *sum, + getNxMvar_fn_t var_fn, int block_size) { + int i, j; + + *sse = 0; + *sum = 0; + + for (i = 0; i < h; i += block_size) { + for (j = 0; j < w; j += block_size) { + unsigned int sse0; + int sum0; + var_fn(src + src_stride * i + j, src_stride, + ref + ref_stride * i + j, ref_stride, &sse0, &sum0); + *sse += sse0; + *sum += sum0; + } + } +} + +unsigned int vpx_variance4x4_sse2(const unsigned char *src, int src_stride, + const unsigned char *ref, int ref_stride, + unsigned int *sse) { + int sum; + get4x4var_sse2(src, src_stride, ref, ref_stride, sse, &sum); + return *sse - (((unsigned int)sum * sum) >> 4); +} + +unsigned int vpx_variance8x4_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 8, 4, + sse, &sum, get4x4var_sse2, 4); + return *sse - (((unsigned int)sum * sum) >> 5); +} + +unsigned int vpx_variance4x8_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 4, 8, + sse, &sum, get4x4var_sse2, 4); + return *sse - (((unsigned int)sum * sum) >> 5); +} + +unsigned int vpx_variance8x8_sse2(const unsigned char *src, int src_stride, + const unsigned char *ref, int ref_stride, + unsigned int *sse) { + int sum; + vpx_get8x8var_sse2(src, src_stride, ref, ref_stride, sse, &sum); + return *sse - (((unsigned int)sum * sum) >> 6); +} + +unsigned int vpx_variance16x8_sse2(const unsigned char *src, int src_stride, + const unsigned char *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 16, 8, + sse, &sum, vpx_get8x8var_sse2, 8); + return *sse - (((unsigned int)sum * sum) >> 7); +} + +unsigned int vpx_variance8x16_sse2(const unsigned char *src, int src_stride, + const unsigned char *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 8, 16, + sse, &sum, vpx_get8x8var_sse2, 8); + return 
*sse - (((unsigned int)sum * sum) >> 7); +} + +unsigned int vpx_variance16x16_sse2(const unsigned char *src, int src_stride, + const unsigned char *ref, int ref_stride, + unsigned int *sse) { + int sum; + vpx_get16x16var_sse2(src, src_stride, ref, ref_stride, sse, &sum); + return *sse - (((unsigned int)sum * sum) >> 8); +} + +unsigned int vpx_variance32x32_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 32, 32, + sse, &sum, vpx_get16x16var_sse2, 16); + return *sse - (((int64_t)sum * sum) >> 10); +} + +unsigned int vpx_variance32x16_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 32, 16, + sse, &sum, vpx_get16x16var_sse2, 16); + return *sse - (((int64_t)sum * sum) >> 9); +} + +unsigned int vpx_variance16x32_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 16, 32, + sse, &sum, vpx_get16x16var_sse2, 16); + return *sse - (((int64_t)sum * sum) >> 9); +} + +unsigned int vpx_variance64x64_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 64, 64, + sse, &sum, vpx_get16x16var_sse2, 16); + return *sse - (((int64_t)sum * sum) >> 12); +} + +unsigned int vpx_variance64x32_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 64, 32, + sse, &sum, vpx_get16x16var_sse2, 16); + return *sse - (((int64_t)sum * sum) >> 11); +} + +unsigned int vpx_variance32x64_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 32, 64, + sse, &sum, vpx_get16x16var_sse2, 16); + return *sse - (((int64_t)sum * sum) >> 11); +} + +unsigned int vpx_mse8x8_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + vpx_variance8x8_sse2(src, src_stride, ref, ref_stride, sse); + return *sse; +} + +unsigned int vpx_mse8x16_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + vpx_variance8x16_sse2(src, src_stride, ref, ref_stride, sse); + return *sse; +} + +unsigned int vpx_mse16x8_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + vpx_variance16x8_sse2(src, src_stride, ref, ref_stride, sse); + return *sse; +} + +unsigned int vpx_mse16x16_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + vpx_variance16x16_sse2(src, src_stride, ref, ref_stride, sse); + return *sse; +} diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_ports/mem.h b/chromium/third_party/libvpx/source/libvpx/vpx_ports/mem.h index 0106a45d6e4..7502f906325 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_ports/mem.h +++ b/chromium/third_party/libvpx/source/libvpx/vpx_ports/mem.h @@ -38,4 +38,16 @@ #define __builtin_prefetch(x) #endif +/* Shift down with rounding */ +#define ROUND_POWER_OF_TWO(value, n) \ + (((value) + (1 << ((n) - 1))) >> (n)) + +#define ALIGN_POWER_OF_TWO(value, n) \ + (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1)) + +#if CONFIG_VP9_HIGHBITDEPTH +#define 
CONVERT_TO_SHORTPTR(x) ((uint16_t*)(((uintptr_t)x) << 1)) +#define CONVERT_TO_BYTEPTR(x) ((uint8_t*)(((uintptr_t)x) >> 1)) +#endif // CONFIG_VP9_HIGHBITDEPTH + #endif // VPX_PORTS_MEM_H_ diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_ports/msvc.h b/chromium/third_party/libvpx/source/libvpx/vpx_ports/msvc.h new file mode 100644 index 00000000000..43a36e76184 --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vpx_ports/msvc.h @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_PORTS_MSVC_H_ +#define VPX_PORTS_MSVC_H_ +#ifdef _MSC_VER + +#include "./vpx_config.h" + +# if _MSC_VER < 1900 // VS2015 provides snprintf +# define snprintf _snprintf +# endif // _MSC_VER < 1900 + +#endif // _MSC_VER +#endif // VPX_PORTS_MSVC_H_ diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_ports/vpx_ports.mk b/chromium/third_party/libvpx/source/libvpx/vpx_ports/vpx_ports.mk index a7275431fe9..ab7fc4ac7a3 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_ports/vpx_ports.mk +++ b/chromium/third_party/libvpx/source/libvpx/vpx_ports/vpx_ports.mk @@ -11,13 +11,14 @@ PORTS_SRCS-yes += vpx_ports.mk -PORTS_SRCS-$(BUILD_LIBVPX) += mem.h -PORTS_SRCS-$(BUILD_LIBVPX) += vpx_timer.h +PORTS_SRCS-yes += mem.h +PORTS_SRCS-yes += msvc.h +PORTS_SRCS-yes += vpx_timer.h ifeq ($(ARCH_X86)$(ARCH_X86_64),yes) -PORTS_SRCS-$(BUILD_LIBVPX) += emms.asm -PORTS_SRCS-$(BUILD_LIBVPX) += x86.h -PORTS_SRCS-$(BUILD_LIBVPX) += x86_abi_support.asm +PORTS_SRCS-yes += emms.asm +PORTS_SRCS-yes += x86.h +PORTS_SRCS-yes += x86_abi_support.asm endif PORTS_SRCS-$(ARCH_ARM) += arm_cpudetect.c diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_scale/generic/gen_scalers.c b/chromium/third_party/libvpx/source/libvpx/vpx_scale/generic/gen_scalers.c index 995c45b6ab6..dab324edfcc 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_scale/generic/gen_scalers.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_scale/generic/gen_scalers.c @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. 
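Aside on the new variance kernels a few hunks above (variance_mmx.c and variance_sse2.c): each vpx_varianceWxH_* function returns the sum of squared differences minus the squared sum divided by the pixel count, i.e. variance = SSE - sum*sum/N with N = W*H, which is why the final shift is 4 for 4x4, 6 for 8x8, 7 for 16x8 and 8x16, 8 for 16x16, and 12 for 64x64. The plain-C sketch below only restates that identity for reference; it is not part of the diff and the helper name is hypothetical.

#include <stdint.h>

/* Reference form of the identity used by the SIMD kernels above:
 * accumulate the sum and the sum of squares of (src - ref), then remove
 * the squared-mean term.  N = w * h is a power of two in the real code,
 * so the division becomes the right shift seen in the diff. */
static unsigned int variance_ref_sketch(const uint8_t *src, int src_stride,
                                        const uint8_t *ref, int ref_stride,
                                        int w, int h, unsigned int *sse) {
  int64_t sum = 0;
  uint64_t sq = 0;
  int r, c;
  for (r = 0; r < h; ++r) {
    for (c = 0; c < w; ++c) {
      const int d = src[r * src_stride + c] - ref[r * ref_stride + c];
      sum += d;
      sq += (uint64_t)(d * d);
    }
  }
  *sse = (unsigned int)sq;
  return (unsigned int)(sq - (uint64_t)((sum * sum) / (w * h)));
}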
*/ - +#include "./vpx_scale_rtcd.h" #include "vpx_scale/vpx_scale.h" #include "vpx_mem/vpx_mem.h" /**************************************************************************** diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_scale/generic/vpx_scale.c b/chromium/third_party/libvpx/source/libvpx/vpx_scale/generic/vpx_scale.c index 089e673757c..15e4ba87e72 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_scale/generic/vpx_scale.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_scale/generic/vpx_scale.c @@ -22,6 +22,7 @@ ****************************************************************************/ #include "./vpx_scale_rtcd.h" #include "vpx_mem/vpx_mem.h" +#include "vpx_scale/vpx_scale.h" #include "vpx_scale/yv12config.h" typedef struct { diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_scale/generic/yv12config.c b/chromium/third_party/libvpx/source/libvpx/vpx_scale/generic/yv12config.c index 169c2ab2d73..7582792d939 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_scale/generic/yv12config.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_scale/generic/yv12config.c @@ -12,9 +12,7 @@ #include "vpx_scale/yv12config.h" #include "vpx_mem/vpx_mem.h" -#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH -#include "vp9/common/vp9_common.h" -#endif +#include "vpx_ports/mem.h" /**************************************************************************** * Exports diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_scale/generic/yv12extend.c b/chromium/third_party/libvpx/source/libvpx/vpx_scale/generic/yv12extend.c index 6214a12189c..086e2f398fb 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_scale/generic/yv12extend.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_scale/generic/yv12extend.c @@ -10,8 +10,10 @@ #include <assert.h> #include "./vpx_config.h" +#include "./vpx_scale_rtcd.h" #include "vpx/vpx_integer.h" #include "vpx_mem/vpx_mem.h" +#include "vpx_ports/mem.h" #include "vpx_scale/yv12config.h" #if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH #include "vp9/common/vp9_common.h" diff --git a/chromium/third_party/libvpx/source/libvpx/vpxdec.c b/chromium/third_party/libvpx/source/libvpx/vpxdec.c index 8c938df8de6..baa61151ca8 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpxdec.c +++ b/chromium/third_party/libvpx/source/libvpx/vpxdec.c @@ -106,24 +106,25 @@ static const arg_def_t *all_args[] = { }; #if CONFIG_VP8_DECODER -static const arg_def_t addnoise_level = ARG_DEF(NULL, "noise-level", 1, - "Enable VP8 postproc add noise"); -static const arg_def_t deblock = ARG_DEF(NULL, "deblock", 0, - "Enable VP8 deblocking"); -static const arg_def_t demacroblock_level = ARG_DEF(NULL, "demacroblock-level", 1, - "Enable VP8 demacroblocking, w/ level"); -static const arg_def_t pp_debug_info = ARG_DEF(NULL, "pp-debug-info", 1, - "Enable VP8 visible debug info"); -static const arg_def_t pp_disp_ref_frame = ARG_DEF(NULL, "pp-dbg-ref-frame", 1, - "Display only selected reference frame per macro block"); -static const arg_def_t pp_disp_mb_modes = ARG_DEF(NULL, "pp-dbg-mb-modes", 1, - "Display only selected macro block modes"); -static const arg_def_t pp_disp_b_modes = ARG_DEF(NULL, "pp-dbg-b-modes", 1, - "Display only selected block modes"); -static const arg_def_t pp_disp_mvs = ARG_DEF(NULL, "pp-dbg-mvs", 1, - "Draw only selected motion vectors"); -static const arg_def_t mfqe = ARG_DEF(NULL, "mfqe", 0, - "Enable multiframe quality enhancement"); +static const arg_def_t addnoise_level = ARG_DEF( + NULL, "noise-level", 1, "Enable VP8 postproc add noise"); 
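The VP8 post-processing options being reflowed in this vpxdec.c hunk feed the VP8_SET_POSTPROC control call that appears further down in the same file. A rough sketch of how such options end up applied, assuming the vp8_postproc_cfg_t layout from vpx/vp8.h and purely illustrative strength values:

#include "vpx/vp8.h"
#include "vpx/vpx_decoder.h"

/* Illustrative only: turn --deblock / --demacroblock-level / --noise-level
 * style options into a single post-processing control call. */
static vpx_codec_err_t enable_vp8_postproc_sketch(vpx_codec_ctx_t *decoder) {
  vp8_postproc_cfg_t cfg;
  cfg.post_proc_flag = VP8_DEBLOCK | VP8_DEMACROBLOCK | VP8_ADDNOISE;
  cfg.deblocking_level = 5;  /* illustrative strength */
  cfg.noise_level = 1;       /* illustrative strength */
  return vpx_codec_control(decoder, VP8_SET_POSTPROC, &cfg);
}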
+static const arg_def_t deblock = ARG_DEF( + NULL, "deblock", 0, "Enable VP8 deblocking"); +static const arg_def_t demacroblock_level = ARG_DEF( + NULL, "demacroblock-level", 1, "Enable VP8 demacroblocking, w/ level"); +static const arg_def_t pp_debug_info = ARG_DEF( + NULL, "pp-debug-info", 1, "Enable VP8 visible debug info"); +static const arg_def_t pp_disp_ref_frame = ARG_DEF( + NULL, "pp-dbg-ref-frame", 1, + "Display only selected reference frame per macro block"); +static const arg_def_t pp_disp_mb_modes = ARG_DEF( + NULL, "pp-dbg-mb-modes", 1, "Display only selected macro block modes"); +static const arg_def_t pp_disp_b_modes = ARG_DEF( + NULL, "pp-dbg-b-modes", 1, "Display only selected block modes"); +static const arg_def_t pp_disp_mvs = ARG_DEF( + NULL, "pp-dbg-mvs", 1, "Draw only selected motion vectors"); +static const arg_def_t mfqe = ARG_DEF( + NULL, "mfqe", 0, "Enable multiframe quality enhancement"); static const arg_def_t *vp8_pp_args[] = { &addnoise_level, &deblock, &demacroblock_level, &pp_debug_info, @@ -169,7 +170,7 @@ static INLINE int libyuv_scale(vpx_image_t *src, vpx_image_t *dst, } #endif -void usage_exit() { +void usage_exit(void) { int i; fprintf(stderr, "Usage: %s <options> filename\n\n" @@ -312,7 +313,7 @@ static void write_image_file(const vpx_image_t *img, const int planes[3], } } -int file_is_raw(struct VpxInputContext *input) { +static int file_is_raw(struct VpxInputContext *input) { uint8_t buf[32]; int is_raw = 0; vpx_codec_stream_info_t si; @@ -343,7 +344,7 @@ int file_is_raw(struct VpxInputContext *input) { return is_raw; } -void show_progress(int frame_in, int frame_out, uint64_t dx_time) { +static void show_progress(int frame_in, int frame_out, uint64_t dx_time) { fprintf(stderr, "%d decoded frames/%d showed frames in %"PRId64" us (%.2f fps)\r", frame_in, frame_out, dx_time, @@ -365,8 +366,8 @@ struct ExternalFrameBufferList { // Application private data passed into the set function. |min_size| is the // minimum size in bytes needed to decode the next frame. |fb| pointer to the // frame buffer. -int get_vp9_frame_buffer(void *cb_priv, size_t min_size, - vpx_codec_frame_buffer_t *fb) { +static int get_vp9_frame_buffer(void *cb_priv, size_t min_size, + vpx_codec_frame_buffer_t *fb) { int i; struct ExternalFrameBufferList *const ext_fb_list = (struct ExternalFrameBufferList *)cb_priv; @@ -403,8 +404,8 @@ int get_vp9_frame_buffer(void *cb_priv, size_t min_size, // Callback used by libvpx when there are no references to the frame buffer. // |cb_priv| user private data passed into the set function. |fb| pointer // to the frame buffer. 
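The two callbacks in this part of the hunk implement libvpx's external frame buffer interface for VP9: get_vp9_frame_buffer hands the decoder application-owned memory of at least min_size bytes, and release_vp9_frame_buffer is invoked once the decoder drops its last reference. A minimal, self-contained sketch of the same pattern using the public vpx_codec_set_frame_buffer_functions API follows; the per-frame calloc here is an assumption for brevity, whereas vpxdec itself recycles buffers from a small list:

#include <stdlib.h>
#include "vpx/vpx_decoder.h"
#include "vpx/vpx_frame_buffer.h"

/* Give the decoder an application-owned, zero-initialized buffer of at
 * least |min_size| bytes.  Returns 0 on success, negative on failure,
 * as the callback contract requires. */
static int app_get_frame_buffer(void *priv, size_t min_size,
                                vpx_codec_frame_buffer_t *fb) {
  (void)priv;
  fb->data = (uint8_t *)calloc(min_size, 1);
  if (!fb->data) return -1;
  fb->size = min_size;
  fb->priv = fb->data;  /* remembered so the release callback can free it */
  return 0;
}

/* Called when the decoder no longer references the buffer. */
static int app_release_frame_buffer(void *priv, vpx_codec_frame_buffer_t *fb) {
  (void)priv;
  free(fb->priv);
  fb->priv = NULL;
  return 0;
}

/* Registered once after vpx_codec_dec_init():
 *   vpx_codec_set_frame_buffer_functions(&decoder, app_get_frame_buffer,
 *                                        app_release_frame_buffer, NULL);
 */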
-int release_vp9_frame_buffer(void *cb_priv, - vpx_codec_frame_buffer_t *fb) { +static int release_vp9_frame_buffer(void *cb_priv, + vpx_codec_frame_buffer_t *fb) { struct ExternalFrameBuffer *const ext_fb = (struct ExternalFrameBuffer *)fb->priv; (void)cb_priv; @@ -412,9 +413,9 @@ int release_vp9_frame_buffer(void *cb_priv, return 0; } -void generate_filename(const char *pattern, char *out, size_t q_len, - unsigned int d_w, unsigned int d_h, - unsigned int frame_in) { +static void generate_filename(const char *pattern, char *out, size_t q_len, + unsigned int d_w, unsigned int d_h, + unsigned int frame_in) { const char *p = pattern; char *q = out; @@ -536,7 +537,7 @@ static int img_shifted_realloc_required(const vpx_image_t *img, } #endif -int main_loop(int argc, const char **argv_) { +static int main_loop(int argc, const char **argv_) { vpx_codec_ctx_t decoder; char *fn = NULL; int i; @@ -813,34 +814,42 @@ int main_loop(int argc, const char **argv_) { fprintf(stderr, "%s\n", decoder.name); #if CONFIG_VP8_DECODER - if (vp8_pp_cfg.post_proc_flag && vpx_codec_control(&decoder, VP8_SET_POSTPROC, &vp8_pp_cfg)) { - fprintf(stderr, "Failed to configure postproc: %s\n", vpx_codec_error(&decoder)); + fprintf(stderr, "Failed to configure postproc: %s\n", + vpx_codec_error(&decoder)); return EXIT_FAILURE; } if (vp8_dbg_color_ref_frame - && vpx_codec_control(&decoder, VP8_SET_DBG_COLOR_REF_FRAME, vp8_dbg_color_ref_frame)) { - fprintf(stderr, "Failed to configure reference block visualizer: %s\n", vpx_codec_error(&decoder)); + && vpx_codec_control(&decoder, VP8_SET_DBG_COLOR_REF_FRAME, + vp8_dbg_color_ref_frame)) { + fprintf(stderr, "Failed to configure reference block visualizer: %s\n", + vpx_codec_error(&decoder)); return EXIT_FAILURE; } if (vp8_dbg_color_mb_modes - && vpx_codec_control(&decoder, VP8_SET_DBG_COLOR_MB_MODES, vp8_dbg_color_mb_modes)) { - fprintf(stderr, "Failed to configure macro block visualizer: %s\n", vpx_codec_error(&decoder)); + && vpx_codec_control(&decoder, VP8_SET_DBG_COLOR_MB_MODES, + vp8_dbg_color_mb_modes)) { + fprintf(stderr, "Failed to configure macro block visualizer: %s\n", + vpx_codec_error(&decoder)); return EXIT_FAILURE; } if (vp8_dbg_color_b_modes - && vpx_codec_control(&decoder, VP8_SET_DBG_COLOR_B_MODES, vp8_dbg_color_b_modes)) { - fprintf(stderr, "Failed to configure block visualizer: %s\n", vpx_codec_error(&decoder)); + && vpx_codec_control(&decoder, VP8_SET_DBG_COLOR_B_MODES, + vp8_dbg_color_b_modes)) { + fprintf(stderr, "Failed to configure block visualizer: %s\n", + vpx_codec_error(&decoder)); return EXIT_FAILURE; } if (vp8_dbg_display_mv - && vpx_codec_control(&decoder, VP8_SET_DBG_DISPLAY_MV, vp8_dbg_display_mv)) { - fprintf(stderr, "Failed to configure motion vector visualizer: %s\n", vpx_codec_error(&decoder)); + && vpx_codec_control(&decoder, VP8_SET_DBG_DISPLAY_MV, + vp8_dbg_display_mv)) { + fprintf(stderr, "Failed to configure motion vector visualizer: %s\n", + vpx_codec_error(&decoder)); return EXIT_FAILURE; } #endif diff --git a/chromium/third_party/libvpx/source/libvpx/vpxenc.c b/chromium/third_party/libvpx/source/libvpx/vpxenc.c index 851d43291cd..8bbb9fc6a20 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpxenc.c +++ b/chromium/third_party/libvpx/source/libvpx/vpxenc.c @@ -99,7 +99,7 @@ static void warn_or_exit_on_error(vpx_codec_ctx_t *ctx, int fatal, va_end(ap); } -int read_frame(struct VpxInputContext *input_ctx, vpx_image_t *img) { +static int read_frame(struct VpxInputContext *input_ctx, vpx_image_t *img) { FILE *f = input_ctx->file; 
y4m_input *y4m = &input_ctx->y4m; int shortread = 0; @@ -114,14 +114,14 @@ int read_frame(struct VpxInputContext *input_ctx, vpx_image_t *img) { return !shortread; } -int file_is_y4m(const char detect[4]) { +static int file_is_y4m(const char detect[4]) { if (memcmp(detect, "YUV4", 4) == 0) { return 1; } return 0; } -int fourcc_is_ivf(const char detect[4]) { +static int fourcc_is_ivf(const char detect[4]) { if (memcmp(detect, "DKIF", 4) == 0) { return 1; } @@ -330,10 +330,6 @@ static const arg_def_t sharpness = ARG_DEF( NULL, "sharpness", 1, "Loop filter sharpness (0..7)"); static const arg_def_t static_thresh = ARG_DEF( NULL, "static-thresh", 1, "Motion detection threshold"); -static const arg_def_t cpu_used_vp8 = ARG_DEF( - NULL, "cpu-used", 1, "CPU Used (-16..16)"); -static const arg_def_t cpu_used_vp9 = ARG_DEF( - NULL, "cpu-used", 1, "CPU Used (-8..8)"); static const arg_def_t auto_altref = ARG_DEF( NULL, "auto-alt-ref", 1, "Enable automatic alt reference frames"); static const arg_def_t arnr_maxframes = ARG_DEF( @@ -353,17 +349,14 @@ static const arg_def_t cq_level = ARG_DEF( NULL, "cq-level", 1, "Constant/Constrained Quality level"); static const arg_def_t max_intra_rate_pct = ARG_DEF( NULL, "max-intra-rate", 1, "Max I-frame bitrate (pct)"); -static const arg_def_t max_inter_rate_pct = ARG_DEF( - NULL, "max-inter-rate", 1, "Max P-frame bitrate (pct)"); -static const arg_def_t gf_cbr_boost_pct = ARG_DEF( - NULL, "gf-cbr-boost", 1, "Boost for Golden Frame in CBR mode (pct)"); - -static const arg_def_t screen_content_mode = ARG_DEF(NULL, "screen-content-mode", 1, - "Screen content mode"); #if CONFIG_VP8_ENCODER +static const arg_def_t cpu_used_vp8 = ARG_DEF( + NULL, "cpu-used", 1, "CPU Used (-16..16)"); static const arg_def_t token_parts = ARG_DEF( NULL, "token-parts", 1, "Number of token partitions to use, log2"); +static const arg_def_t screen_content_mode = ARG_DEF( + NULL, "screen-content-mode", 1, "Screen content mode"); static const arg_def_t *vp8_args[] = { &cpu_used_vp8, &auto_altref, &noise_sens, &sharpness, &static_thresh, &token_parts, &arnr_maxframes, &arnr_strength, &arnr_type, @@ -382,6 +375,8 @@ static const int vp8_arg_ctrl_map[] = { #endif #if CONFIG_VP9_ENCODER +static const arg_def_t cpu_used_vp9 = ARG_DEF( + NULL, "cpu-used", 1, "CPU Used (-8..8)"); static const arg_def_t tile_cols = ARG_DEF( NULL, "tile-columns", 1, "Number of tile columns to use, log2"); static const arg_def_t tile_rows = ARG_DEF( @@ -397,6 +392,10 @@ static const arg_def_t aq_mode = ARG_DEF( static const arg_def_t frame_periodic_boost = ARG_DEF( NULL, "frame-boost", 1, "Enable frame periodic boost (0: off (default), 1: on)"); +static const arg_def_t gf_cbr_boost_pct = ARG_DEF( + NULL, "gf-cbr-boost", 1, "Boost for Golden Frame in CBR mode (pct)"); +static const arg_def_t max_inter_rate_pct = ARG_DEF( + NULL, "max-inter-rate", 1, "Max P-frame bitrate (pct)"); static const struct arg_enum_list color_space_enum[] = { { "unknown", VPX_CS_UNKNOWN }, @@ -467,8 +466,9 @@ static const int vp9_arg_ctrl_map[] = { static const arg_def_t *no_args[] = { NULL }; -void usage_exit() { +void usage_exit(void) { int i; + const int num_encoder = get_vpx_encoder_count(); fprintf(stderr, "Usage: %s <options> -o dst_filename src_filename \n", exec_name); @@ -496,11 +496,15 @@ void usage_exit() { " in fractional seconds. 
Default is 1/1000.\n"); fprintf(stderr, "\nIncluded encoders:\n\n"); - for (i = 0; i < get_vpx_encoder_count(); ++i) { + for (i = 0; i < num_encoder; ++i) { const VpxInterface *const encoder = get_vpx_encoder_by_index(i); - fprintf(stderr, " %-6s - %s\n", - encoder->name, vpx_codec_iface_name(encoder->codec_interface())); + const char* defstr = (i == (num_encoder - 1)) ? "(default)" : ""; + fprintf(stderr, " %-6s - %s %s\n", + encoder->name, vpx_codec_iface_name(encoder->codec_interface()), + defstr); } + fprintf(stderr, "\n "); + fprintf(stderr, "Use --codec to switch to a non-default encoder.\n\n"); exit(EXIT_FAILURE); } @@ -793,8 +797,8 @@ struct stream_state { }; -void validate_positive_rational(const char *msg, - struct vpx_rational *rat) { +static void validate_positive_rational(const char *msg, + struct vpx_rational *rat) { if (rat->den < 0) { rat->num *= -1; rat->den *= -1; @@ -811,10 +815,14 @@ void validate_positive_rational(const char *msg, static void parse_global_config(struct VpxEncoderConfig *global, char **argv) { char **argi, **argj; struct arg arg; + const int num_encoder = get_vpx_encoder_count(); + + if (num_encoder < 1) + die("Error: no valid encoder available\n"); /* Initialize default parameters */ memset(global, 0, sizeof(*global)); - global->codec = get_vpx_encoder_by_index(0); + global->codec = get_vpx_encoder_by_index(num_encoder - 1); global->passes = 0; global->color_type = I420; /* Assign default deadline to good quality */ @@ -919,7 +927,7 @@ static void parse_global_config(struct VpxEncoderConfig *global, char **argv) { } -void open_input_file(struct VpxInputContext *input) { +static void open_input_file(struct VpxInputContext *input) { /* Parse certain options from the input file, if possible */ input->file = strcmp(input->filename, "-") ? fopen(input->filename, "rb") : set_binary_mode(stdin); @@ -935,6 +943,10 @@ void open_input_file(struct VpxInputContext *input) { rewind(input->file); } + /* Default to 1:1 pixel aspect ratio. */ + input->pixel_aspect_ratio.numerator = 1; + input->pixel_aspect_ratio.denominator = 1; + /* For RAW input sources, these bytes will applied on the first frame * in read_frame(). 
*/ @@ -948,6 +960,8 @@ void open_input_file(struct VpxInputContext *input) { input->file_type = FILE_TYPE_Y4M; input->width = input->y4m.pic_w; input->height = input->y4m.pic_h; + input->pixel_aspect_ratio.numerator = input->y4m.par_n; + input->pixel_aspect_ratio.denominator = input->y4m.par_d; input->framerate.numerator = input->y4m.fps_n; input->framerate.denominator = input->y4m.fps_d; input->fmt = input->y4m.vpx_fmt; @@ -1381,7 +1395,8 @@ static void show_stream_config(struct stream_state *stream, static void open_output_file(struct stream_state *stream, - struct VpxEncoderConfig *global) { + struct VpxEncoderConfig *global, + const struct VpxRational *pixel_aspect_ratio) { const char *fn = stream->config.out_fn; const struct vpx_codec_enc_cfg *const cfg = &stream->config.cfg; @@ -1402,7 +1417,8 @@ static void open_output_file(struct stream_state *stream, write_webm_file_header(&stream->ebml, cfg, &global->framerate, stream->config.stereo_fmt, - global->codec->fourcc); + global->codec->fourcc, + pixel_aspect_ratio); } #endif @@ -2035,7 +2051,8 @@ int main(int argc, const char **argv_) { } FOREACH_STREAM(setup_pass(stream, &global, pass)); - FOREACH_STREAM(open_output_file(stream, &global)); + FOREACH_STREAM(open_output_file(stream, &global, + &input.pixel_aspect_ratio)); FOREACH_STREAM(initialize_encoder(stream, &global)); #if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH diff --git a/chromium/third_party/libvpx/source/libvpx/webmenc.cc b/chromium/third_party/libvpx/source/libvpx/webmenc.cc index a0e542b17cf..8212ee36cf6 100644 --- a/chromium/third_party/libvpx/source/libvpx/webmenc.cc +++ b/chromium/third_party/libvpx/source/libvpx/webmenc.cc @@ -24,7 +24,8 @@ void write_webm_file_header(struct EbmlGlobal *glob, const vpx_codec_enc_cfg_t *cfg, const struct vpx_rational *fps, stereo_format_t stereo_fmt, - unsigned int fourcc) { + unsigned int fourcc, + const struct VpxRational *par) { mkvmuxer::MkvWriter *const writer = new mkvmuxer::MkvWriter(glob->stream); mkvmuxer::Segment *const segment = new mkvmuxer::Segment(); segment->Init(writer); @@ -49,6 +50,15 @@ void write_webm_file_header(struct EbmlGlobal *glob, segment->GetTrackByNumber(video_track_id)); video_track->SetStereoMode(stereo_fmt); video_track->set_codec_id(fourcc == VP8_FOURCC ? "V_VP8" : "V_VP9"); + if (par->numerator > 1 || par->denominator > 1) { + // TODO(fgalligan): Add support of DisplayUnit, Display Aspect Ratio type + // to WebM format. + const uint64_t display_width = + static_cast<uint64_t>(((cfg->g_w * par->numerator * 1.0) / + par->denominator) + .5); + video_track->set_display_width(display_width); + video_track->set_display_height(cfg->g_h); + } if (glob->debug) { video_track->set_uid(kDebugTrackUid); } diff --git a/chromium/third_party/libvpx/source/libvpx/webmenc.h b/chromium/third_party/libvpx/source/libvpx/webmenc.h index 0ac606be483..c255d3de669 100644 --- a/chromium/third_party/libvpx/source/libvpx/webmenc.h +++ b/chromium/third_party/libvpx/source/libvpx/webmenc.h @@ -42,7 +42,8 @@ void write_webm_file_header(struct EbmlGlobal *glob, const vpx_codec_enc_cfg_t *cfg, const struct vpx_rational *fps, stereo_format_t stereo_fmt, - unsigned int fourcc); + unsigned int fourcc, + const struct VpxRational *par); void write_webm_block(struct EbmlGlobal *glob, const vpx_codec_enc_cfg_t *cfg, |