author     Allan Sandfeld Jensen <allan.jensen@qt.io>    2017-07-17 13:57:45 +0200
committer  Allan Sandfeld Jensen <allan.jensen@qt.io>    2017-07-19 13:44:40 +0000
commit     6ec7b8da05d21a3878bd21c691b41e675d74bb1c (patch)
tree       b87f250bc19413750b9bb9cdbf2da20ef5014820 /chromium/third_party/libvpx
parent     ec02ee4181c49b61fce1c8fb99292dbb8139cc90 (diff)
download   qtwebengine-chromium-6ec7b8da05d21a3878bd21c691b41e675d74bb1c.tar.gz
BASELINE: Update Chromium to 60.0.3112.70
Change-Id: I9911c2280a014d4632f254857876a395d4baed2d
Reviewed-by: Alexandru Croitor <alexandru.croitor@qt.io>
Diffstat (limited to 'chromium/third_party/libvpx')
-rw-r--r--  chromium/third_party/libvpx/README.chromium | 4
-rw-r--r--  chromium/third_party/libvpx/libvpx_srcs.gni | 43
-rw-r--r--  chromium/third_party/libvpx/source/config/ios/arm-neon/vp9_rtcd.h | 8
-rw-r--r--  chromium/third_party/libvpx/source/config/ios/arm-neon/vpx_dsp_rtcd.h | 33
-rw-r--r--  chromium/third_party/libvpx/source/config/ios/arm64/vp9_rtcd.h | 8
-rw-r--r--  chromium/third_party/libvpx/source/config/ios/arm64/vpx_dsp_rtcd.h | 33
-rw-r--r--  chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vp9_rtcd.h | 10
-rw-r--r--  chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vpx_dsp_rtcd.h | 55
-rw-r--r--  chromium/third_party/libvpx/source/config/linux/arm-neon/vp9_rtcd.h | 8
-rw-r--r--  chromium/third_party/libvpx/source/config/linux/arm-neon/vpx_dsp_rtcd.h | 33
-rw-r--r--  chromium/third_party/libvpx/source/config/linux/arm/vp9_rtcd.h | 5
-rw-r--r--  chromium/third_party/libvpx/source/config/linux/arm64/vp9_rtcd.h | 8
-rw-r--r--  chromium/third_party/libvpx/source/config/linux/arm64/vpx_dsp_rtcd.h | 33
-rw-r--r--  chromium/third_party/libvpx/source/config/linux/generic/vp9_rtcd.h | 13
-rw-r--r--  chromium/third_party/libvpx/source/config/linux/generic/vpx_dsp_rtcd.h | 46
-rw-r--r--  chromium/third_party/libvpx/source/config/linux/ia32/vp9_rtcd.h | 23
-rw-r--r--  chromium/third_party/libvpx/source/config/linux/ia32/vpx_dsp_rtcd.h | 121
-rw-r--r--  chromium/third_party/libvpx/source/config/linux/mips64el/vp9_rtcd.h | 5
-rw-r--r--  chromium/third_party/libvpx/source/config/linux/mipsel/vp9_rtcd.h | 5
-rw-r--r--  chromium/third_party/libvpx/source/config/linux/x64/vp9_rtcd.h | 23
-rw-r--r--  chromium/third_party/libvpx/source/config/linux/x64/vpx_dsp_rtcd.h | 133
-rw-r--r--  chromium/third_party/libvpx/source/config/mac/ia32/vp9_rtcd.h | 23
-rw-r--r--  chromium/third_party/libvpx/source/config/mac/ia32/vpx_dsp_rtcd.h | 121
-rw-r--r--  chromium/third_party/libvpx/source/config/mac/x64/vp9_rtcd.h | 23
-rw-r--r--  chromium/third_party/libvpx/source/config/mac/x64/vpx_dsp_rtcd.h | 133
-rw-r--r--  chromium/third_party/libvpx/source/config/nacl/vp9_rtcd.h | 13
-rw-r--r--  chromium/third_party/libvpx/source/config/nacl/vpx_dsp_rtcd.h | 46
-rw-r--r--  chromium/third_party/libvpx/source/config/vpx_version.h | 6
-rw-r--r--  chromium/third_party/libvpx/source/config/win/ia32/vp9_rtcd.h | 23
-rw-r--r--  chromium/third_party/libvpx/source/config/win/ia32/vpx_dsp_rtcd.h | 121
-rw-r--r--  chromium/third_party/libvpx/source/config/win/x64/vp9_rtcd.h | 23
-rw-r--r--  chromium/third_party/libvpx/source/config/win/x64/vpx_dsp_rtcd.h | 133
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/build/make/Makefile | 4
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/build/make/configure.sh | 4
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_if.c | 10
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_idct.c | 25
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_idct.h | 16
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconinter.c | 5
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconinter.h | 4
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_rtcd_defs.pl | 17
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c | 27
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodeframe.c | 42
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_denoiser_neon.c | 352
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c | 1
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/encoder/mips/msa/vp9_temporal_filter_msa.c | 285
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c | 19
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h | 1
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_block.h | 9
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_denoiser.c | 25
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_denoiser.h | 6
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeframe.c | 105
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemb.c | 37
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.c | 490
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.h | 27
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ethread.c | 8
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.c | 242
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.h | 7
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_frame_scale.c | 9
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mcomp.c | 54
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_multi_thread.c | 24
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_noise_estimate.c | 30
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_pickmode.c | 173
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.c | 8
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rd.c | 14
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rd.h | 23
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rdopt.c | 61
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.c | 140
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.h | 10
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_svc_layercontext.c | 24
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_svc_layercontext.h | 7
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_temporal_filter.c | 75
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/temporal_filter_sse4.c | 375
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_avx2.c | 107
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_intrin_avx2.c | 73
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_sse2.asm | 19
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c | 14
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm | 212
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/vp9_cx_iface.c | 12
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/vp9cx.mk | 7
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx/src/svc_encodeframe.c | 4
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx/vp8cx.h | 12
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/avg_neon.c | 1
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/fdct_neon.c | 92
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/fwd_txfm_neon.c | 3
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/hadamard_neon.c | 1
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c | 15
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c | 9
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c | 3
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c | 3
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct32x32_add_neon.c | 3
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c | 6
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c | 9
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c | 38
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c | 7
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c | 7
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c | 26
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.c | 1
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c | 1
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c | 1
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_add_neon.c | 3
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c | 1
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_add_neon.c | 20
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_add_neon.c | 1
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.h | 52
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/mem_neon.h | 96
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/subpel_variance_neon.c | 94
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/variance_neon.c | 302
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/inv_txfm.c | 73
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/ppc/intrapred_vsx.c | 749
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/ppc/sad_vsx.c | 102
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/ppc/types_vsx.h | 48
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/ppc/variance_vsx.c | 103
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/ppc/vpx_convolve_vsx.c | 418
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/sad.c | 4
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/variance.c | 5
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_convolve.c | 81
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_convolve.h | 4
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp.mk | 18
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl | 425
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/avg_pred_sse2.c | 69
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/convolve.h | 30
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c | 1106
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse2.c | 244
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse2.c | 41
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse2.c | 129
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse2.c | 216
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h | 33
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_sse2.c | 769
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_sse2.h | 28
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/transpose_sse2.h | 55
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm | 2
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_ports/mem.h | 2
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpxdec.c | 2
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/webmdec.cc | 5
134 files changed, 6770 insertions, 3356 deletions
diff --git a/chromium/third_party/libvpx/README.chromium b/chromium/third_party/libvpx/README.chromium
index 414280e6fbf..04399d3d7f6 100644
--- a/chromium/third_party/libvpx/README.chromium
+++ b/chromium/third_party/libvpx/README.chromium
@@ -5,9 +5,9 @@ License: BSD
License File: source/libvpx/LICENSE
Security Critical: yes
-Date: Monday April 10 2017
+Date: Monday May 22 2017
Branch: master
-Commit: f22b828d685adee4c7a561990302e2d21b5e0047
+Commit: b3bf91bdc60220c004a22d21c867cc392e684b81
Description:
Contains the sources used to compile libvpx binaries used by Google Chrome and
diff --git a/chromium/third_party/libvpx/libvpx_srcs.gni b/chromium/third_party/libvpx/libvpx_srcs.gni
index a39f4572712..08b2a5c2d75 100644
--- a/chromium/third_party/libvpx/libvpx_srcs.gni
+++ b/chromium/third_party/libvpx/libvpx_srcs.gni
@@ -324,7 +324,9 @@ libvpx_srcs_x86 = [
"//third_party/libvpx/source/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h",
"//third_party/libvpx/source/libvpx/vpx_dsp/x86/fwd_txfm_impl_sse2.h",
"//third_party/libvpx/source/libvpx/vpx_dsp/x86/fwd_txfm_sse2.h",
+ "//third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h",
"//third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_sse2.h",
+ "//third_party/libvpx/source/libvpx/vpx_dsp/x86/transpose_sse2.h",
"//third_party/libvpx/source/libvpx/vpx_dsp/x86/txfm_common_sse2.h",
"//third_party/libvpx/source/libvpx/vpx_dsp/x86/vpx_asm_stubs.c",
"//third_party/libvpx/source/libvpx/vpx_mem/include/vpx_mem_intrnl.h",
@@ -372,7 +374,6 @@ libvpx_srcs_x86_assembly = [
"//third_party/libvpx/source/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm",
"//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_sse2.asm",
"//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_sse2.asm",
- "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm",
"//third_party/libvpx/source/libvpx/vpx_dsp/x86/add_noise_sse2.asm",
"//third_party/libvpx/source/libvpx/vpx_dsp/x86/deblock_sse2.asm",
"//third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_intrapred_sse2.asm",
@@ -414,7 +415,12 @@ libvpx_srcs_x86_sse2 = [
"//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c",
"//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/x86/avg_intrin_sse2.c",
+ "//third_party/libvpx/source/libvpx/vpx_dsp/x86/avg_pred_sse2.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/x86/fwd_txfm_sse2.c",
+ "//third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse2.c",
+ "//third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse2.c",
+ "//third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse2.c",
+ "//third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse2.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_variance_sse2.c",
@@ -432,12 +438,15 @@ libvpx_srcs_x86_ssse3 = [
"//third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_ssse3.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c",
]
-libvpx_srcs_x86_sse4_1 =
- [ "//third_party/libvpx/source/libvpx/vp8/encoder/x86/quantize_sse4.c" ]
+libvpx_srcs_x86_sse4_1 = [
+ "//third_party/libvpx/source/libvpx/vp8/encoder/x86/quantize_sse4.c",
+ "//third_party/libvpx/source/libvpx/vp9/encoder/x86/temporal_filter_sse4.c",
+]
libvpx_srcs_x86_avx = [ "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_diamond_search_sad_avx.c" ]
libvpx_srcs_x86_avx2 = [
- "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_intrin_avx2.c",
+ "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_avx2.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/x86/fwd_txfm_avx2.c",
+ "//third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/x86/loopfilter_avx2.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/x86/sad4d_avx2.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/x86/sad_avx2.c",
@@ -766,7 +775,9 @@ libvpx_srcs_x86_64 = [
"//third_party/libvpx/source/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h",
"//third_party/libvpx/source/libvpx/vpx_dsp/x86/fwd_txfm_impl_sse2.h",
"//third_party/libvpx/source/libvpx/vpx_dsp/x86/fwd_txfm_sse2.h",
+ "//third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h",
"//third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_sse2.h",
+ "//third_party/libvpx/source/libvpx/vpx_dsp/x86/transpose_sse2.h",
"//third_party/libvpx/source/libvpx/vpx_dsp/x86/txfm_common_sse2.h",
"//third_party/libvpx/source/libvpx/vpx_dsp/x86/vpx_asm_stubs.c",
"//third_party/libvpx/source/libvpx/vpx_mem/include/vpx_mem_intrnl.h",
@@ -816,7 +827,6 @@ libvpx_srcs_x86_64_assembly = [
"//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_sse2.asm",
"//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_sse2.asm",
"//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm",
- "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm",
"//third_party/libvpx/source/libvpx/vpx_dsp/x86/add_noise_sse2.asm",
"//third_party/libvpx/source/libvpx/vpx_dsp/x86/avg_ssse3_x86_64.asm",
"//third_party/libvpx/source/libvpx/vpx_dsp/x86/deblock_sse2.asm",
@@ -863,7 +873,12 @@ libvpx_srcs_x86_64_sse2 = [
"//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c",
"//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/x86/avg_intrin_sse2.c",
+ "//third_party/libvpx/source/libvpx/vpx_dsp/x86/avg_pred_sse2.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/x86/fwd_txfm_sse2.c",
+ "//third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse2.c",
+ "//third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse2.c",
+ "//third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse2.c",
+ "//third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse2.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_variance_sse2.c",
@@ -881,12 +896,15 @@ libvpx_srcs_x86_64_ssse3 = [
"//third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_ssse3.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c",
]
-libvpx_srcs_x86_64_sse4_1 =
- [ "//third_party/libvpx/source/libvpx/vp8/encoder/x86/quantize_sse4.c" ]
+libvpx_srcs_x86_64_sse4_1 = [
+ "//third_party/libvpx/source/libvpx/vp8/encoder/x86/quantize_sse4.c",
+ "//third_party/libvpx/source/libvpx/vp9/encoder/x86/temporal_filter_sse4.c",
+]
libvpx_srcs_x86_64_avx = [ "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_diamond_search_sad_avx.c" ]
libvpx_srcs_x86_64_avx2 = [
- "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_intrin_avx2.c",
+ "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_avx2.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/x86/fwd_txfm_avx2.c",
+ "//third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/x86/loopfilter_avx2.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/x86/sad4d_avx2.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/x86/sad_avx2.c",
@@ -1434,6 +1452,7 @@ libvpx_srcs_arm_neon = [
"//third_party/libvpx/source/libvpx/vp9/decoder/vp9_dthread.c",
"//third_party/libvpx/source/libvpx/vp9/decoder/vp9_dthread.h",
"//third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c",
+ "//third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_denoiser_neon.c",
"//third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_error_neon.c",
"//third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c",
"//third_party/libvpx/source/libvpx/vp9/encoder/vp9_alt_ref_aq.c",
@@ -1533,6 +1552,7 @@ libvpx_srcs_arm_neon = [
"//third_party/libvpx/source/libvpx/vpx_dsp/add_noise.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/arm/avg_neon.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/arm/deblock_neon.c",
+ "//third_party/libvpx/source/libvpx/vpx_dsp/arm/fdct_neon.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/arm/fwd_txfm_neon.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/arm/hadamard_neon.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c",
@@ -1545,6 +1565,7 @@ libvpx_srcs_arm_neon = [
"//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_add_neon.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.h",
"//third_party/libvpx/source/libvpx/vpx_dsp/arm/intrapred_neon.c",
+ "//third_party/libvpx/source/libvpx/vpx_dsp/arm/mem_neon.h",
"//third_party/libvpx/source/libvpx/vpx_dsp/arm/sad4d_neon.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/arm/sad_neon.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/arm/subpel_variance_neon.c",
@@ -1903,6 +1924,7 @@ libvpx_srcs_arm_neon_cpu_detect = [
"//third_party/libvpx/source/libvpx/vpx/vpx_integer.h",
"//third_party/libvpx/source/libvpx/vpx_dsp/add_noise.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.h",
+ "//third_party/libvpx/source/libvpx/vpx_dsp/arm/mem_neon.h",
"//third_party/libvpx/source/libvpx/vpx_dsp/arm/transpose_neon.h",
"//third_party/libvpx/source/libvpx/vpx_dsp/avg.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/bitreader.c",
@@ -2000,10 +2022,12 @@ libvpx_srcs_arm_neon_cpu_detect_neon = [
"//third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c",
"//third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c",
"//third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c",
+ "//third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_denoiser_neon.c",
"//third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_error_neon.c",
"//third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/arm/avg_neon.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/arm/deblock_neon.c",
+ "//third_party/libvpx/source/libvpx/vpx_dsp/arm/fdct_neon.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/arm/fwd_txfm_neon.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/arm/hadamard_neon.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c",
@@ -2226,6 +2250,7 @@ libvpx_srcs_arm64 = [
"//third_party/libvpx/source/libvpx/vp9/decoder/vp9_dthread.c",
"//third_party/libvpx/source/libvpx/vp9/decoder/vp9_dthread.h",
"//third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c",
+ "//third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_denoiser_neon.c",
"//third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_error_neon.c",
"//third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c",
"//third_party/libvpx/source/libvpx/vp9/encoder/vp9_alt_ref_aq.c",
@@ -2325,6 +2350,7 @@ libvpx_srcs_arm64 = [
"//third_party/libvpx/source/libvpx/vpx_dsp/add_noise.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/arm/avg_neon.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/arm/deblock_neon.c",
+ "//third_party/libvpx/source/libvpx/vpx_dsp/arm/fdct_neon.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/arm/fwd_txfm_neon.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/arm/hadamard_neon.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c",
@@ -2340,6 +2366,7 @@ libvpx_srcs_arm64 = [
"//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.h",
"//third_party/libvpx/source/libvpx/vpx_dsp/arm/intrapred_neon.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/arm/loopfilter_neon.c",
+ "//third_party/libvpx/source/libvpx/vpx_dsp/arm/mem_neon.h",
"//third_party/libvpx/source/libvpx/vpx_dsp/arm/sad4d_neon.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/arm/sad_neon.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/arm/subpel_variance_neon.c",
diff --git a/chromium/third_party/libvpx/source/config/ios/arm-neon/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/ios/arm-neon/vp9_rtcd.h
index 957219d5a8a..3b104550883 100644
--- a/chromium/third_party/libvpx/source/config/ios/arm-neon/vp9_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/ios/arm-neon/vp9_rtcd.h
@@ -14,6 +14,7 @@
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_enums.h"
+#include "vp9/common/vp9_filter.h"
struct macroblockd;
@@ -37,7 +38,8 @@ int64_t vp9_block_error_fp_neon(const int16_t *coeff, const int16_t *dqcoeff, in
#define vp9_block_error_fp vp9_block_error_fp_neon
int vp9_denoiser_filter_c(const uint8_t *sig, int sig_stride, const uint8_t *mc_avg, int mc_avg_stride, uint8_t *avg, int avg_stride, int increase_denoising, BLOCK_SIZE bs, int motion_magnitude);
-#define vp9_denoiser_filter vp9_denoiser_filter_c
+int vp9_denoiser_filter_neon(const uint8_t *sig, int sig_stride, const uint8_t *mc_avg, int mc_avg_stride, uint8_t *avg, int avg_stride, int increase_denoising, BLOCK_SIZE bs, int motion_magnitude);
+#define vp9_denoiser_filter vp9_denoiser_filter_neon
int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
#define vp9_diamond_search_sad vp9_diamond_search_sad_c
@@ -85,10 +87,10 @@ void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int sk
void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c
-void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst);
+void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
#define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_c
-void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
+void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
#define vp9_temporal_filter_apply vp9_temporal_filter_apply_c
void vp9_rtcd(void);
diff --git a/chromium/third_party/libvpx/source/config/ios/arm-neon/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/ios/arm-neon/vpx_dsp_rtcd.h
index 6d960874198..61c4f1fe737 100644
--- a/chromium/third_party/libvpx/source/config/ios/arm-neon/vpx_dsp_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/ios/arm-neon/vpx_dsp_rtcd.h
@@ -228,7 +228,8 @@ void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride);
#define vpx_fdct32x32_rd vpx_fdct32x32_rd_c
void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_fdct4x4 vpx_fdct4x4_c
+void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct4x4 vpx_fdct4x4_neon
void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride);
#define vpx_fdct4x4_1 vpx_fdct4x4_1_c
@@ -676,20 +677,24 @@ uint32_t vpx_sub_pixel_variance16x16_neon(const uint8_t *src_ptr, int source_str
#define vpx_sub_pixel_variance16x16 vpx_sub_pixel_variance16x16_neon
uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance16x32 vpx_sub_pixel_variance16x32_c
+uint32_t vpx_sub_pixel_variance16x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance16x32 vpx_sub_pixel_variance16x32_neon
uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance16x8 vpx_sub_pixel_variance16x8_c
+uint32_t vpx_sub_pixel_variance16x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance16x8 vpx_sub_pixel_variance16x8_neon
uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance32x16 vpx_sub_pixel_variance32x16_c
+uint32_t vpx_sub_pixel_variance32x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance32x16 vpx_sub_pixel_variance32x16_neon
uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t vpx_sub_pixel_variance32x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define vpx_sub_pixel_variance32x32 vpx_sub_pixel_variance32x32_neon
uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance32x64 vpx_sub_pixel_variance32x64_c
+uint32_t vpx_sub_pixel_variance32x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance32x64 vpx_sub_pixel_variance32x64_neon
uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define vpx_sub_pixel_variance4x4 vpx_sub_pixel_variance4x4_c
@@ -698,17 +703,20 @@ uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride,
#define vpx_sub_pixel_variance4x8 vpx_sub_pixel_variance4x8_c
uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance64x32 vpx_sub_pixel_variance64x32_c
+uint32_t vpx_sub_pixel_variance64x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance64x32 vpx_sub_pixel_variance64x32_neon
uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t vpx_sub_pixel_variance64x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define vpx_sub_pixel_variance64x64 vpx_sub_pixel_variance64x64_neon
uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance8x16 vpx_sub_pixel_variance8x16_c
+uint32_t vpx_sub_pixel_variance8x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance8x16 vpx_sub_pixel_variance8x16_neon
uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance8x4 vpx_sub_pixel_variance8x4_c
+uint32_t vpx_sub_pixel_variance8x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance8x4 vpx_sub_pixel_variance8x4_neon
uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t vpx_sub_pixel_variance8x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
@@ -758,14 +766,16 @@ unsigned int vpx_variance16x16_neon(const uint8_t *src_ptr, int source_stride, c
#define vpx_variance16x16 vpx_variance16x16_neon
unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_variance16x32 vpx_variance16x32_c
+unsigned int vpx_variance16x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance16x32 vpx_variance16x32_neon
unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
unsigned int vpx_variance16x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
#define vpx_variance16x8 vpx_variance16x8_neon
unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_variance32x16 vpx_variance32x16_c
+unsigned int vpx_variance32x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance32x16 vpx_variance32x16_neon
unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
unsigned int vpx_variance32x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
@@ -794,7 +804,8 @@ unsigned int vpx_variance8x16_neon(const uint8_t *src_ptr, int source_stride, co
#define vpx_variance8x16 vpx_variance8x16_neon
unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_variance8x4 vpx_variance8x4_c
+unsigned int vpx_variance8x4_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance8x4 vpx_variance8x4_neon
unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
unsigned int vpx_variance8x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
diff --git a/chromium/third_party/libvpx/source/config/ios/arm64/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/ios/arm64/vp9_rtcd.h
index 957219d5a8a..3b104550883 100644
--- a/chromium/third_party/libvpx/source/config/ios/arm64/vp9_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/ios/arm64/vp9_rtcd.h
@@ -14,6 +14,7 @@
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_enums.h"
+#include "vp9/common/vp9_filter.h"
struct macroblockd;
@@ -37,7 +38,8 @@ int64_t vp9_block_error_fp_neon(const int16_t *coeff, const int16_t *dqcoeff, in
#define vp9_block_error_fp vp9_block_error_fp_neon
int vp9_denoiser_filter_c(const uint8_t *sig, int sig_stride, const uint8_t *mc_avg, int mc_avg_stride, uint8_t *avg, int avg_stride, int increase_denoising, BLOCK_SIZE bs, int motion_magnitude);
-#define vp9_denoiser_filter vp9_denoiser_filter_c
+int vp9_denoiser_filter_neon(const uint8_t *sig, int sig_stride, const uint8_t *mc_avg, int mc_avg_stride, uint8_t *avg, int avg_stride, int increase_denoising, BLOCK_SIZE bs, int motion_magnitude);
+#define vp9_denoiser_filter vp9_denoiser_filter_neon
int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
#define vp9_diamond_search_sad vp9_diamond_search_sad_c
@@ -85,10 +87,10 @@ void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int sk
void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c
-void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst);
+void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
#define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_c
-void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
+void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
#define vp9_temporal_filter_apply vp9_temporal_filter_apply_c
void vp9_rtcd(void);
diff --git a/chromium/third_party/libvpx/source/config/ios/arm64/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/ios/arm64/vpx_dsp_rtcd.h
index 6d960874198..61c4f1fe737 100644
--- a/chromium/third_party/libvpx/source/config/ios/arm64/vpx_dsp_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/ios/arm64/vpx_dsp_rtcd.h
@@ -228,7 +228,8 @@ void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride);
#define vpx_fdct32x32_rd vpx_fdct32x32_rd_c
void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_fdct4x4 vpx_fdct4x4_c
+void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct4x4 vpx_fdct4x4_neon
void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride);
#define vpx_fdct4x4_1 vpx_fdct4x4_1_c
@@ -676,20 +677,24 @@ uint32_t vpx_sub_pixel_variance16x16_neon(const uint8_t *src_ptr, int source_str
#define vpx_sub_pixel_variance16x16 vpx_sub_pixel_variance16x16_neon
uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance16x32 vpx_sub_pixel_variance16x32_c
+uint32_t vpx_sub_pixel_variance16x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance16x32 vpx_sub_pixel_variance16x32_neon
uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance16x8 vpx_sub_pixel_variance16x8_c
+uint32_t vpx_sub_pixel_variance16x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance16x8 vpx_sub_pixel_variance16x8_neon
uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance32x16 vpx_sub_pixel_variance32x16_c
+uint32_t vpx_sub_pixel_variance32x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance32x16 vpx_sub_pixel_variance32x16_neon
uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t vpx_sub_pixel_variance32x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define vpx_sub_pixel_variance32x32 vpx_sub_pixel_variance32x32_neon
uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance32x64 vpx_sub_pixel_variance32x64_c
+uint32_t vpx_sub_pixel_variance32x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance32x64 vpx_sub_pixel_variance32x64_neon
uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define vpx_sub_pixel_variance4x4 vpx_sub_pixel_variance4x4_c
@@ -698,17 +703,20 @@ uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride,
#define vpx_sub_pixel_variance4x8 vpx_sub_pixel_variance4x8_c
uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance64x32 vpx_sub_pixel_variance64x32_c
+uint32_t vpx_sub_pixel_variance64x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance64x32 vpx_sub_pixel_variance64x32_neon
uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t vpx_sub_pixel_variance64x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define vpx_sub_pixel_variance64x64 vpx_sub_pixel_variance64x64_neon
uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance8x16 vpx_sub_pixel_variance8x16_c
+uint32_t vpx_sub_pixel_variance8x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance8x16 vpx_sub_pixel_variance8x16_neon
uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance8x4 vpx_sub_pixel_variance8x4_c
+uint32_t vpx_sub_pixel_variance8x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance8x4 vpx_sub_pixel_variance8x4_neon
uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t vpx_sub_pixel_variance8x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
@@ -758,14 +766,16 @@ unsigned int vpx_variance16x16_neon(const uint8_t *src_ptr, int source_stride, c
#define vpx_variance16x16 vpx_variance16x16_neon
unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_variance16x32 vpx_variance16x32_c
+unsigned int vpx_variance16x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance16x32 vpx_variance16x32_neon
unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
unsigned int vpx_variance16x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
#define vpx_variance16x8 vpx_variance16x8_neon
unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_variance32x16 vpx_variance32x16_c
+unsigned int vpx_variance32x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance32x16 vpx_variance32x16_neon
unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
unsigned int vpx_variance32x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
@@ -794,7 +804,8 @@ unsigned int vpx_variance8x16_neon(const uint8_t *src_ptr, int source_stride, co
#define vpx_variance8x16 vpx_variance8x16_neon
unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_variance8x4 vpx_variance8x4_c
+unsigned int vpx_variance8x4_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance8x4 vpx_variance8x4_neon
unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
unsigned int vpx_variance8x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
diff --git a/chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vp9_rtcd.h
index 9129bf63688..015772d2b4b 100644
--- a/chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vp9_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vp9_rtcd.h
@@ -14,6 +14,7 @@
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_enums.h"
+#include "vp9/common/vp9_filter.h"
struct macroblockd;
@@ -37,7 +38,8 @@ int64_t vp9_block_error_fp_neon(const int16_t *coeff, const int16_t *dqcoeff, in
RTCD_EXTERN int64_t (*vp9_block_error_fp)(const int16_t *coeff, const int16_t *dqcoeff, int block_size);
int vp9_denoiser_filter_c(const uint8_t *sig, int sig_stride, const uint8_t *mc_avg, int mc_avg_stride, uint8_t *avg, int avg_stride, int increase_denoising, BLOCK_SIZE bs, int motion_magnitude);
-#define vp9_denoiser_filter vp9_denoiser_filter_c
+int vp9_denoiser_filter_neon(const uint8_t *sig, int sig_stride, const uint8_t *mc_avg, int mc_avg_stride, uint8_t *avg, int avg_stride, int increase_denoising, BLOCK_SIZE bs, int motion_magnitude);
+RTCD_EXTERN int (*vp9_denoiser_filter)(const uint8_t *sig, int sig_stride, const uint8_t *mc_avg, int mc_avg_stride, uint8_t *avg, int avg_stride, int increase_denoising, BLOCK_SIZE bs, int motion_magnitude);
int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
#define vp9_diamond_search_sad vp9_diamond_search_sad_c
@@ -85,10 +87,10 @@ RTCD_EXTERN void (*vp9_quantize_fp)(const tran_low_t *coeff_ptr, intptr_t n_coef
void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c
-void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst);
+void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
#define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_c
-void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
+void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
#define vp9_temporal_filter_apply vp9_temporal_filter_apply_c
void vp9_rtcd(void);
@@ -105,6 +107,8 @@ static void setup_rtcd_internal(void)
vp9_block_error_fp = vp9_block_error_fp_c;
if (flags & HAS_NEON) vp9_block_error_fp = vp9_block_error_fp_neon;
+ vp9_denoiser_filter = vp9_denoiser_filter_c;
+ if (flags & HAS_NEON) vp9_denoiser_filter = vp9_denoiser_filter_neon;
vp9_fdct8x8_quant = vp9_fdct8x8_quant_c;
if (flags & HAS_NEON) vp9_fdct8x8_quant = vp9_fdct8x8_quant_neon;
vp9_iht4x4_16_add = vp9_iht4x4_16_add_c;
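
The two flavors of rtcd header in this diff differ in how a kernel gets bound. The static configs (e.g. ios/arm-neon above) resolve the symbol at compile time with a plain #define alias, while this arm-neon-cpu-detect config declares an RTCD_EXTERN function pointer and binds it in setup_rtcd_internal() once the CPU feature flags are known. Below is a minimal, self-contained C sketch of that run-time path; the signature is simplified and arm_cpu_caps() is stubbed for illustration, so it is not the actual libvpx code.

#include <stdio.h>

#define HAS_NEON 0x01

/* Plain C fallback (stand-in body for the real filter). */
static int vp9_denoiser_filter_c(int motion_magnitude) {
  return motion_magnitude;
}

/* NEON-accelerated variant (stub here; real one uses NEON intrinsics). */
static int vp9_denoiser_filter_neon(int motion_magnitude) {
  return motion_magnitude;
}

/* The cpu-detect config's equivalent of an RTCD_EXTERN pointer. */
static int (*vp9_denoiser_filter)(int motion_magnitude);

/* Stubbed capability probe; real code reads HWCAP or /proc/cpuinfo. */
static int arm_cpu_caps(void) { return HAS_NEON; }

static void setup_rtcd_internal(void) {
  int flags = arm_cpu_caps();
  /* Default to the C path, then upgrade when the CPU supports NEON,
   * mirroring the pattern added to setup_rtcd_internal() above. */
  vp9_denoiser_filter = vp9_denoiser_filter_c;
  if (flags & HAS_NEON) vp9_denoiser_filter = vp9_denoiser_filter_neon;
}

int main(void) {
  setup_rtcd_internal();
  printf("filtered: %d\n", vp9_denoiser_filter(42));
  return 0;
}

The static configs skip the pointer indirection entirely because the target ISA is fixed at build time; the cpu-detect build pays one indirect call per invocation in exchange for a single binary that works with or without NEON.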
diff --git a/chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vpx_dsp_rtcd.h
index f8e41363a38..c818a5184df 100644
--- a/chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vpx_dsp_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vpx_dsp_rtcd.h
@@ -228,7 +228,8 @@ void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride);
#define vpx_fdct32x32_rd vpx_fdct32x32_rd_c
void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_fdct4x4 vpx_fdct4x4_c
+void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *output, int stride);
+RTCD_EXTERN void (*vpx_fdct4x4)(const int16_t *input, tran_low_t *output, int stride);
void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride);
#define vpx_fdct4x4_1 vpx_fdct4x4_1_c
@@ -676,20 +677,24 @@ uint32_t vpx_sub_pixel_variance16x16_neon(const uint8_t *src_ptr, int source_str
RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance16x32 vpx_sub_pixel_variance16x32_c
+uint32_t vpx_sub_pixel_variance16x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance16x8 vpx_sub_pixel_variance16x8_c
+uint32_t vpx_sub_pixel_variance16x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance32x16 vpx_sub_pixel_variance32x16_c
+uint32_t vpx_sub_pixel_variance32x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t vpx_sub_pixel_variance32x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance32x64 vpx_sub_pixel_variance32x64_c
+uint32_t vpx_sub_pixel_variance32x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define vpx_sub_pixel_variance4x4 vpx_sub_pixel_variance4x4_c
@@ -698,17 +703,20 @@ uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride,
#define vpx_sub_pixel_variance4x8 vpx_sub_pixel_variance4x8_c
uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance64x32 vpx_sub_pixel_variance64x32_c
+uint32_t vpx_sub_pixel_variance64x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t vpx_sub_pixel_variance64x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance8x16 vpx_sub_pixel_variance8x16_c
+uint32_t vpx_sub_pixel_variance8x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance8x4 vpx_sub_pixel_variance8x4_c
+uint32_t vpx_sub_pixel_variance8x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t vpx_sub_pixel_variance8x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
@@ -758,14 +766,16 @@ unsigned int vpx_variance16x16_neon(const uint8_t *src_ptr, int source_stride, c
RTCD_EXTERN unsigned int (*vpx_variance16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_variance16x32 vpx_variance16x32_c
+unsigned int vpx_variance16x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance16x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
unsigned int vpx_variance16x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
RTCD_EXTERN unsigned int (*vpx_variance16x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_variance32x16 vpx_variance32x16_c
+unsigned int vpx_variance32x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance32x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
unsigned int vpx_variance32x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
@@ -794,7 +804,8 @@ unsigned int vpx_variance8x16_neon(const uint8_t *src_ptr, int source_stride, co
RTCD_EXTERN unsigned int (*vpx_variance8x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_variance8x4 vpx_variance8x4_c
+unsigned int vpx_variance8x4_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance8x4)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
unsigned int vpx_variance8x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
@@ -887,6 +898,8 @@ static void setup_rtcd_internal(void)
if (flags & HAS_NEON) vpx_dc_top_predictor_4x4 = vpx_dc_top_predictor_4x4_neon;
vpx_dc_top_predictor_8x8 = vpx_dc_top_predictor_8x8_c;
if (flags & HAS_NEON) vpx_dc_top_predictor_8x8 = vpx_dc_top_predictor_8x8_neon;
+ vpx_fdct4x4 = vpx_fdct4x4_c;
+ if (flags & HAS_NEON) vpx_fdct4x4 = vpx_fdct4x4_neon;
vpx_fdct8x8 = vpx_fdct8x8_c;
if (flags & HAS_NEON) vpx_fdct8x8 = vpx_fdct8x8_neon;
vpx_fdct8x8_1 = vpx_fdct8x8_1_c;
@@ -997,10 +1010,24 @@ static void setup_rtcd_internal(void)
if (flags & HAS_NEON) vpx_satd = vpx_satd_neon;
vpx_sub_pixel_variance16x16 = vpx_sub_pixel_variance16x16_c;
if (flags & HAS_NEON) vpx_sub_pixel_variance16x16 = vpx_sub_pixel_variance16x16_neon;
+ vpx_sub_pixel_variance16x32 = vpx_sub_pixel_variance16x32_c;
+ if (flags & HAS_NEON) vpx_sub_pixel_variance16x32 = vpx_sub_pixel_variance16x32_neon;
+ vpx_sub_pixel_variance16x8 = vpx_sub_pixel_variance16x8_c;
+ if (flags & HAS_NEON) vpx_sub_pixel_variance16x8 = vpx_sub_pixel_variance16x8_neon;
+ vpx_sub_pixel_variance32x16 = vpx_sub_pixel_variance32x16_c;
+ if (flags & HAS_NEON) vpx_sub_pixel_variance32x16 = vpx_sub_pixel_variance32x16_neon;
vpx_sub_pixel_variance32x32 = vpx_sub_pixel_variance32x32_c;
if (flags & HAS_NEON) vpx_sub_pixel_variance32x32 = vpx_sub_pixel_variance32x32_neon;
+ vpx_sub_pixel_variance32x64 = vpx_sub_pixel_variance32x64_c;
+ if (flags & HAS_NEON) vpx_sub_pixel_variance32x64 = vpx_sub_pixel_variance32x64_neon;
+ vpx_sub_pixel_variance64x32 = vpx_sub_pixel_variance64x32_c;
+ if (flags & HAS_NEON) vpx_sub_pixel_variance64x32 = vpx_sub_pixel_variance64x32_neon;
vpx_sub_pixel_variance64x64 = vpx_sub_pixel_variance64x64_c;
if (flags & HAS_NEON) vpx_sub_pixel_variance64x64 = vpx_sub_pixel_variance64x64_neon;
+ vpx_sub_pixel_variance8x16 = vpx_sub_pixel_variance8x16_c;
+ if (flags & HAS_NEON) vpx_sub_pixel_variance8x16 = vpx_sub_pixel_variance8x16_neon;
+ vpx_sub_pixel_variance8x4 = vpx_sub_pixel_variance8x4_c;
+ if (flags & HAS_NEON) vpx_sub_pixel_variance8x4 = vpx_sub_pixel_variance8x4_neon;
vpx_sub_pixel_variance8x8 = vpx_sub_pixel_variance8x8_c;
if (flags & HAS_NEON) vpx_sub_pixel_variance8x8 = vpx_sub_pixel_variance8x8_neon;
vpx_subtract_block = vpx_subtract_block_c;
@@ -1023,8 +1050,12 @@ static void setup_rtcd_internal(void)
if (flags & HAS_NEON) vpx_v_predictor_8x8 = vpx_v_predictor_8x8_neon;
vpx_variance16x16 = vpx_variance16x16_c;
if (flags & HAS_NEON) vpx_variance16x16 = vpx_variance16x16_neon;
+ vpx_variance16x32 = vpx_variance16x32_c;
+ if (flags & HAS_NEON) vpx_variance16x32 = vpx_variance16x32_neon;
vpx_variance16x8 = vpx_variance16x8_c;
if (flags & HAS_NEON) vpx_variance16x8 = vpx_variance16x8_neon;
+ vpx_variance32x16 = vpx_variance32x16_c;
+ if (flags & HAS_NEON) vpx_variance32x16 = vpx_variance32x16_neon;
vpx_variance32x32 = vpx_variance32x32_c;
if (flags & HAS_NEON) vpx_variance32x32 = vpx_variance32x32_neon;
vpx_variance32x64 = vpx_variance32x64_c;
@@ -1035,6 +1066,8 @@ static void setup_rtcd_internal(void)
if (flags & HAS_NEON) vpx_variance64x64 = vpx_variance64x64_neon;
vpx_variance8x16 = vpx_variance8x16_c;
if (flags & HAS_NEON) vpx_variance8x16 = vpx_variance8x16_neon;
+ vpx_variance8x4 = vpx_variance8x4_c;
+ if (flags & HAS_NEON) vpx_variance8x4 = vpx_variance8x4_neon;
vpx_variance8x8 = vpx_variance8x8_c;
if (flags & HAS_NEON) vpx_variance8x8 = vpx_variance8x8_neon;
vpx_vector_var = vpx_vector_var_c;
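
The setup_rtcd_internal() additions above follow libvpx's run-time CPU dispatch pattern for the arm-neon-cpu-detect config: each kernel is an RTCD_EXTERN function pointer that defaults to the C implementation and is rebound to the NEON implementation only when CPU detection reports NEON support. A minimal self-contained sketch of that pattern — with illustrative names (my_variance, my_setup_rtcd, HAS_NEON_FLAG), not the real libvpx symbols:

#include <stdint.h>

/* Reference C implementation; always available. */
static unsigned int my_variance_c(const uint8_t *src, int stride) {
  (void)src; (void)stride;
  return 0; /* real C kernel would go here */
}

/* NEON-accelerated variant; body stands in for the real intrinsics. */
static unsigned int my_variance_neon(const uint8_t *src, int stride) {
  return my_variance_c(src, stride); /* NEON kernel would go here */
}

#define HAS_NEON_FLAG 0x1 /* illustrative CPU-feature bit */

/* One dispatch pointer per kernel, as with RTCD_EXTERN above. */
static unsigned int (*my_variance)(const uint8_t *, int);

static void my_setup_rtcd(int cpu_flags) {
  my_variance = my_variance_c;        /* safe default */
  if (cpu_flags & HAS_NEON_FLAG)
    my_variance = my_variance_neon;   /* run-time override */
}
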
diff --git a/chromium/third_party/libvpx/source/config/linux/arm-neon/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/linux/arm-neon/vp9_rtcd.h
index 957219d5a8a..3b104550883 100644
--- a/chromium/third_party/libvpx/source/config/linux/arm-neon/vp9_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/linux/arm-neon/vp9_rtcd.h
@@ -14,6 +14,7 @@
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_enums.h"
+#include "vp9/common/vp9_filter.h"
struct macroblockd;
@@ -37,7 +38,8 @@ int64_t vp9_block_error_fp_neon(const int16_t *coeff, const int16_t *dqcoeff, in
#define vp9_block_error_fp vp9_block_error_fp_neon
int vp9_denoiser_filter_c(const uint8_t *sig, int sig_stride, const uint8_t *mc_avg, int mc_avg_stride, uint8_t *avg, int avg_stride, int increase_denoising, BLOCK_SIZE bs, int motion_magnitude);
-#define vp9_denoiser_filter vp9_denoiser_filter_c
+int vp9_denoiser_filter_neon(const uint8_t *sig, int sig_stride, const uint8_t *mc_avg, int mc_avg_stride, uint8_t *avg, int avg_stride, int increase_denoising, BLOCK_SIZE bs, int motion_magnitude);
+#define vp9_denoiser_filter vp9_denoiser_filter_neon
int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
#define vp9_diamond_search_sad vp9_diamond_search_sad_c
@@ -85,10 +87,10 @@ void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int sk
void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c
-void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst);
+void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
#define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_c
-void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
+void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
#define vp9_temporal_filter_apply vp9_temporal_filter_apply_c
void vp9_rtcd(void);
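
In the fixed-NEON configs (linux/arm-neon, linux/arm64) the same kernels are instead bound at compile time with a plain #define to the _neon symbol, so calls resolve directly with no pointer indirection. A sketch of that header-style binding, again with illustrative names:

#include <stdint.h>

uint32_t my_variance_c(const uint8_t *src, int stride);
uint32_t my_variance_neon(const uint8_t *src, int stride);
/* NEON is guaranteed by the target config, so bind directly: */
#define my_variance my_variance_neon
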
diff --git a/chromium/third_party/libvpx/source/config/linux/arm-neon/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/linux/arm-neon/vpx_dsp_rtcd.h
index 6d960874198..61c4f1fe737 100644
--- a/chromium/third_party/libvpx/source/config/linux/arm-neon/vpx_dsp_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/linux/arm-neon/vpx_dsp_rtcd.h
@@ -228,7 +228,8 @@ void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride);
#define vpx_fdct32x32_rd vpx_fdct32x32_rd_c
void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_fdct4x4 vpx_fdct4x4_c
+void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct4x4 vpx_fdct4x4_neon
void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride);
#define vpx_fdct4x4_1 vpx_fdct4x4_1_c
@@ -676,20 +677,24 @@ uint32_t vpx_sub_pixel_variance16x16_neon(const uint8_t *src_ptr, int source_str
#define vpx_sub_pixel_variance16x16 vpx_sub_pixel_variance16x16_neon
uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance16x32 vpx_sub_pixel_variance16x32_c
+uint32_t vpx_sub_pixel_variance16x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance16x32 vpx_sub_pixel_variance16x32_neon
uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance16x8 vpx_sub_pixel_variance16x8_c
+uint32_t vpx_sub_pixel_variance16x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance16x8 vpx_sub_pixel_variance16x8_neon
uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance32x16 vpx_sub_pixel_variance32x16_c
+uint32_t vpx_sub_pixel_variance32x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance32x16 vpx_sub_pixel_variance32x16_neon
uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t vpx_sub_pixel_variance32x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define vpx_sub_pixel_variance32x32 vpx_sub_pixel_variance32x32_neon
uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance32x64 vpx_sub_pixel_variance32x64_c
+uint32_t vpx_sub_pixel_variance32x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance32x64 vpx_sub_pixel_variance32x64_neon
uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define vpx_sub_pixel_variance4x4 vpx_sub_pixel_variance4x4_c
@@ -698,17 +703,20 @@ uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride,
#define vpx_sub_pixel_variance4x8 vpx_sub_pixel_variance4x8_c
uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance64x32 vpx_sub_pixel_variance64x32_c
+uint32_t vpx_sub_pixel_variance64x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance64x32 vpx_sub_pixel_variance64x32_neon
uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t vpx_sub_pixel_variance64x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define vpx_sub_pixel_variance64x64 vpx_sub_pixel_variance64x64_neon
uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance8x16 vpx_sub_pixel_variance8x16_c
+uint32_t vpx_sub_pixel_variance8x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance8x16 vpx_sub_pixel_variance8x16_neon
uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance8x4 vpx_sub_pixel_variance8x4_c
+uint32_t vpx_sub_pixel_variance8x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance8x4 vpx_sub_pixel_variance8x4_neon
uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t vpx_sub_pixel_variance8x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
@@ -758,14 +766,16 @@ unsigned int vpx_variance16x16_neon(const uint8_t *src_ptr, int source_stride, c
#define vpx_variance16x16 vpx_variance16x16_neon
unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_variance16x32 vpx_variance16x32_c
+unsigned int vpx_variance16x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance16x32 vpx_variance16x32_neon
unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
unsigned int vpx_variance16x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
#define vpx_variance16x8 vpx_variance16x8_neon
unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_variance32x16 vpx_variance32x16_c
+unsigned int vpx_variance32x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance32x16 vpx_variance32x16_neon
unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
unsigned int vpx_variance32x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
@@ -794,7 +804,8 @@ unsigned int vpx_variance8x16_neon(const uint8_t *src_ptr, int source_stride, co
#define vpx_variance8x16 vpx_variance8x16_neon
unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_variance8x4 vpx_variance8x4_c
+unsigned int vpx_variance8x4_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance8x4 vpx_variance8x4_neon
unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
unsigned int vpx_variance8x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
diff --git a/chromium/third_party/libvpx/source/config/linux/arm/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/linux/arm/vp9_rtcd.h
index 5d9d14d08a0..e259775c0e3 100644
--- a/chromium/third_party/libvpx/source/config/linux/arm/vp9_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/linux/arm/vp9_rtcd.h
@@ -14,6 +14,7 @@
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_enums.h"
+#include "vp9/common/vp9_filter.h"
struct macroblockd;
@@ -80,10 +81,10 @@ void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_
void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c
-void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst);
+void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
#define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_c
-void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
+void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
#define vp9_temporal_filter_apply vp9_temporal_filter_apply_c
void vp9_rtcd(void);
diff --git a/chromium/third_party/libvpx/source/config/linux/arm64/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/linux/arm64/vp9_rtcd.h
index 957219d5a8a..3b104550883 100644
--- a/chromium/third_party/libvpx/source/config/linux/arm64/vp9_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/linux/arm64/vp9_rtcd.h
@@ -14,6 +14,7 @@
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_enums.h"
+#include "vp9/common/vp9_filter.h"
struct macroblockd;
@@ -37,7 +38,8 @@ int64_t vp9_block_error_fp_neon(const int16_t *coeff, const int16_t *dqcoeff, in
#define vp9_block_error_fp vp9_block_error_fp_neon
int vp9_denoiser_filter_c(const uint8_t *sig, int sig_stride, const uint8_t *mc_avg, int mc_avg_stride, uint8_t *avg, int avg_stride, int increase_denoising, BLOCK_SIZE bs, int motion_magnitude);
-#define vp9_denoiser_filter vp9_denoiser_filter_c
+int vp9_denoiser_filter_neon(const uint8_t *sig, int sig_stride, const uint8_t *mc_avg, int mc_avg_stride, uint8_t *avg, int avg_stride, int increase_denoising, BLOCK_SIZE bs, int motion_magnitude);
+#define vp9_denoiser_filter vp9_denoiser_filter_neon
int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
#define vp9_diamond_search_sad vp9_diamond_search_sad_c
@@ -85,10 +87,10 @@ void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int sk
void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c
-void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst);
+void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
#define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_c
-void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
+void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
#define vp9_temporal_filter_apply vp9_temporal_filter_apply_c
void vp9_rtcd(void);
diff --git a/chromium/third_party/libvpx/source/config/linux/arm64/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/linux/arm64/vpx_dsp_rtcd.h
index 6d960874198..61c4f1fe737 100644
--- a/chromium/third_party/libvpx/source/config/linux/arm64/vpx_dsp_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/linux/arm64/vpx_dsp_rtcd.h
@@ -228,7 +228,8 @@ void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride);
#define vpx_fdct32x32_rd vpx_fdct32x32_rd_c
void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_fdct4x4 vpx_fdct4x4_c
+void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct4x4 vpx_fdct4x4_neon
void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride);
#define vpx_fdct4x4_1 vpx_fdct4x4_1_c
@@ -676,20 +677,24 @@ uint32_t vpx_sub_pixel_variance16x16_neon(const uint8_t *src_ptr, int source_str
#define vpx_sub_pixel_variance16x16 vpx_sub_pixel_variance16x16_neon
uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance16x32 vpx_sub_pixel_variance16x32_c
+uint32_t vpx_sub_pixel_variance16x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance16x32 vpx_sub_pixel_variance16x32_neon
uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance16x8 vpx_sub_pixel_variance16x8_c
+uint32_t vpx_sub_pixel_variance16x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance16x8 vpx_sub_pixel_variance16x8_neon
uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance32x16 vpx_sub_pixel_variance32x16_c
+uint32_t vpx_sub_pixel_variance32x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance32x16 vpx_sub_pixel_variance32x16_neon
uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t vpx_sub_pixel_variance32x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define vpx_sub_pixel_variance32x32 vpx_sub_pixel_variance32x32_neon
uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance32x64 vpx_sub_pixel_variance32x64_c
+uint32_t vpx_sub_pixel_variance32x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance32x64 vpx_sub_pixel_variance32x64_neon
uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define vpx_sub_pixel_variance4x4 vpx_sub_pixel_variance4x4_c
@@ -698,17 +703,20 @@ uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride,
#define vpx_sub_pixel_variance4x8 vpx_sub_pixel_variance4x8_c
uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance64x32 vpx_sub_pixel_variance64x32_c
+uint32_t vpx_sub_pixel_variance64x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance64x32 vpx_sub_pixel_variance64x32_neon
uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t vpx_sub_pixel_variance64x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#define vpx_sub_pixel_variance64x64 vpx_sub_pixel_variance64x64_neon
uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance8x16 vpx_sub_pixel_variance8x16_c
+uint32_t vpx_sub_pixel_variance8x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance8x16 vpx_sub_pixel_variance8x16_neon
uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance8x4 vpx_sub_pixel_variance8x4_c
+uint32_t vpx_sub_pixel_variance8x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance8x4 vpx_sub_pixel_variance8x4_neon
uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t vpx_sub_pixel_variance8x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
@@ -758,14 +766,16 @@ unsigned int vpx_variance16x16_neon(const uint8_t *src_ptr, int source_stride, c
#define vpx_variance16x16 vpx_variance16x16_neon
unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_variance16x32 vpx_variance16x32_c
+unsigned int vpx_variance16x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance16x32 vpx_variance16x32_neon
unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
unsigned int vpx_variance16x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
#define vpx_variance16x8 vpx_variance16x8_neon
unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_variance32x16 vpx_variance32x16_c
+unsigned int vpx_variance32x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance32x16 vpx_variance32x16_neon
unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
unsigned int vpx_variance32x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
@@ -794,7 +804,8 @@ unsigned int vpx_variance8x16_neon(const uint8_t *src_ptr, int source_stride, co
#define vpx_variance8x16 vpx_variance8x16_neon
unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_variance8x4 vpx_variance8x4_c
+unsigned int vpx_variance8x4_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance8x4 vpx_variance8x4_neon
unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
unsigned int vpx_variance8x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
diff --git a/chromium/third_party/libvpx/source/config/linux/generic/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/linux/generic/vp9_rtcd.h
index 4a32a38e064..0e14191aaec 100644
--- a/chromium/third_party/libvpx/source/config/linux/generic/vp9_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/linux/generic/vp9_rtcd.h
@@ -14,6 +14,7 @@
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_enums.h"
+#include "vp9/common/vp9_filter.h"
struct macroblockd;
@@ -80,13 +81,13 @@ void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, i
void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
#define vp9_highbd_fwht4x4 vp9_highbd_fwht4x4_c
-void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd);
+void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd);
#define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c
-void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd);
+void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
#define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c
-void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd);
+void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
#define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c
void vp9_highbd_mbpost_proc_across_ip_c(uint16_t *src, int pitch, int rows, int cols, int flimit);
@@ -104,7 +105,7 @@ void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, in
void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
#define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_c
-void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
+void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
#define vp9_highbd_temporal_filter_apply vp9_highbd_temporal_filter_apply_c
void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
@@ -122,10 +123,10 @@ void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_
void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c
-void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst);
+void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
#define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_c
-void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
+void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
#define vp9_temporal_filter_apply vp9_temporal_filter_apply_c
void vp9_rtcd(void);
diff --git a/chromium/third_party/libvpx/source/config/linux/generic/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/linux/generic/vpx_dsp_rtcd.h
index 8d126acaa9f..a09ed559657 100644
--- a/chromium/third_party/libvpx/source/config/linux/generic/vpx_dsp_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/linux/generic/vpx_dsp_rtcd.h
@@ -652,28 +652,28 @@ unsigned int vpx_highbd_avg_8x8_c(const uint8_t *, int p);
void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride);
#define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_c
-void vpx_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
#define vpx_highbd_convolve8 vpx_highbd_convolve8_c
-void vpx_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
#define vpx_highbd_convolve8_avg vpx_highbd_convolve8_avg_c
-void vpx_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avg_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
#define vpx_highbd_convolve8_avg_horiz vpx_highbd_convolve8_avg_horiz_c
-void vpx_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avg_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
#define vpx_highbd_convolve8_avg_vert vpx_highbd_convolve8_avg_vert_c
-void vpx_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
#define vpx_highbd_convolve8_horiz vpx_highbd_convolve8_horiz_c
-void vpx_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
#define vpx_highbd_convolve8_vert vpx_highbd_convolve8_vert_c
-void vpx_highbd_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
#define vpx_highbd_convolve_avg vpx_highbd_convolve_avg_c
-void vpx_highbd_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
#define vpx_highbd_convolve_copy vpx_highbd_convolve_copy_c
void vpx_highbd_d117_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
@@ -832,49 +832,49 @@ void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint1
void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
#define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_c
-void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct16x16_10_add vpx_highbd_idct16x16_10_add_c
-void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_c
-void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct16x16_256_add vpx_highbd_idct16x16_256_add_c
-void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct16x16_38_add vpx_highbd_idct16x16_38_add_c
-void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_c
-void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct32x32_135_add vpx_highbd_idct32x32_135_add_c
-void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct32x32_1_add vpx_highbd_idct32x32_1_add_c
-void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_c
-void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct4x4_16_add vpx_highbd_idct4x4_16_add_c
-void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_c
-void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct8x8_12_add vpx_highbd_idct8x8_12_add_c
-void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_c
-void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct8x8_64_add vpx_highbd_idct8x8_64_add_c
-void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c
-void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c
void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
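
The hunks above retype the high-bitdepth kernels to take uint16_t pixel pointers directly, since 10- and 12-bit samples do not fit in a byte: each pixel occupies 16 bits and results are clipped to the active bit depth bd. A hedged sketch of what such a kernel's inner step looks like (illustrative helper names, not libvpx API):

#include <stdint.h>

/* Clamp a reconstructed value to [0, 2^bd - 1], e.g. 255, 1023 or 4095. */
static uint16_t my_clip_pixel_highbd(int val, int bd) {
  const int max = (1 << bd) - 1;
  return (uint16_t)(val < 0 ? 0 : (val > max ? max : val));
}

/* Add a residual block to a 16-bit destination, clipping per bit depth —
 * the kind of operation the *_add kernels above perform. */
static void my_add_residual_highbd(const int16_t *res, uint16_t *dest,
                                   int stride, int w, int h, int bd) {
  for (int r = 0; r < h; ++r) {
    for (int c = 0; c < w; ++c)
      dest[c] = my_clip_pixel_highbd(dest[c] + res[c], bd);
    res += w;
    dest += stride;
  }
}
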
diff --git a/chromium/third_party/libvpx/source/config/linux/ia32/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/linux/ia32/vp9_rtcd.h
index 6b04a45895d..c178d191672 100644
--- a/chromium/third_party/libvpx/source/config/linux/ia32/vp9_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/linux/ia32/vp9_rtcd.h
@@ -14,6 +14,7 @@
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_enums.h"
+#include "vp9/common/vp9_filter.h"
struct macroblockd;
@@ -95,13 +96,13 @@ void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, i
void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
#define vp9_highbd_fwht4x4 vp9_highbd_fwht4x4_c
-void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd);
+void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd);
#define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c
-void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd);
+void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
#define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c
-void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd);
+void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
#define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c
void vp9_highbd_mbpost_proc_across_ip_c(uint16_t *src, int pitch, int rows, int cols, int flimit);
@@ -119,7 +120,7 @@ void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, in
void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
#define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_c
-void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
+void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
#define vp9_highbd_temporal_filter_apply vp9_highbd_temporal_filter_apply_c
void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
@@ -141,13 +142,13 @@ RTCD_EXTERN void (*vp9_quantize_fp)(const tran_low_t *coeff_ptr, intptr_t n_coef
void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c
-void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst);
-void vp9_scale_and_extend_frame_ssse3(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst);
-RTCD_EXTERN void (*vp9_scale_and_extend_frame)(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst);
+void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
+void vp9_scale_and_extend_frame_ssse3(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
+RTCD_EXTERN void (*vp9_scale_and_extend_frame)(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
-void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
-void vp9_temporal_filter_apply_sse2(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
-RTCD_EXTERN void (*vp9_temporal_filter_apply)(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
+void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
+void vp9_temporal_filter_apply_sse4_1(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
+RTCD_EXTERN void (*vp9_temporal_filter_apply)(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
void vp9_rtcd(void);
@@ -198,7 +199,7 @@ static void setup_rtcd_internal(void)
vp9_scale_and_extend_frame = vp9_scale_and_extend_frame_c;
if (flags & HAS_SSSE3) vp9_scale_and_extend_frame = vp9_scale_and_extend_frame_ssse3;
vp9_temporal_filter_apply = vp9_temporal_filter_apply_c;
- if (flags & HAS_SSE2) vp9_temporal_filter_apply = vp9_temporal_filter_apply_sse2;
+ if (flags & HAS_SSE4_1) vp9_temporal_filter_apply = vp9_temporal_filter_apply_sse4_1;
}
#endif
diff --git a/chromium/third_party/libvpx/source/config/linux/ia32/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/linux/ia32/vpx_dsp_rtcd.h
index 2ebbf6e3fa3..49450cda3db 100644
--- a/chromium/third_party/libvpx/source/config/linux/ia32/vpx_dsp_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/linux/ia32/vpx_dsp_rtcd.h
@@ -28,7 +28,8 @@ unsigned int vpx_avg_8x8_sse2(const uint8_t *, int p);
RTCD_EXTERN unsigned int (*vpx_avg_8x8)(const uint8_t *, int p);
void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
-#define vpx_comp_avg_pred vpx_comp_avg_pred_c
+void vpx_comp_avg_pred_sse2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
+RTCD_EXTERN void (*vpx_comp_avg_pred)(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
@@ -824,31 +825,39 @@ unsigned int vpx_highbd_avg_8x8_c(const uint8_t *, int p);
void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride);
#define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_c
-void vpx_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-#define vpx_highbd_convolve8 vpx_highbd_convolve8_c
+void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve8)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-#define vpx_highbd_convolve8_avg vpx_highbd_convolve8_avg_c
+void vpx_highbd_convolve8_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avg_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve8_avg)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-#define vpx_highbd_convolve8_avg_horiz vpx_highbd_convolve8_avg_horiz_c
+void vpx_highbd_convolve8_avg_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avg_horiz_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve8_avg_horiz)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-#define vpx_highbd_convolve8_avg_vert vpx_highbd_convolve8_avg_vert_c
+void vpx_highbd_convolve8_avg_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avg_vert_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve8_avg_vert)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-#define vpx_highbd_convolve8_horiz vpx_highbd_convolve8_horiz_c
+void vpx_highbd_convolve8_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_horiz_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve8_horiz)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-#define vpx_highbd_convolve8_vert vpx_highbd_convolve8_vert_c
+void vpx_highbd_convolve8_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_vert_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve8_vert)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-RTCD_EXTERN void (*vpx_highbd_convolve_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve_avg_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve_avg)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-RTCD_EXTERN void (*vpx_highbd_convolve_copy)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve_copy_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve_copy)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
void vpx_highbd_d117_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
#define vpx_highbd_d117_predictor_16x16 vpx_highbd_d117_predictor_16x16_c
@@ -1015,56 +1024,56 @@ void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint1
void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
#define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_c
-void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-RTCD_EXTERN void (*vpx_highbd_idct16x16_10_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+RTCD_EXTERN void (*vpx_highbd_idct16x16_10_add)(const tran_low_t *input, uint16_t *dest, int stride, int bd);
-void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_c
-void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-RTCD_EXTERN void (*vpx_highbd_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+RTCD_EXTERN void (*vpx_highbd_idct16x16_256_add)(const tran_low_t *input, uint16_t *dest, int stride, int bd);
-void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-RTCD_EXTERN void (*vpx_highbd_idct16x16_38_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+RTCD_EXTERN void (*vpx_highbd_idct16x16_38_add)(const tran_low_t *input, uint16_t *dest, int stride, int bd);
-void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_c
-void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct32x32_135_add vpx_highbd_idct32x32_135_add_c
-void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-RTCD_EXTERN void (*vpx_highbd_idct32x32_1_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+RTCD_EXTERN void (*vpx_highbd_idct32x32_1_add)(const tran_low_t *input, uint16_t *dest, int stride, int bd);
-void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_c
-void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-RTCD_EXTERN void (*vpx_highbd_idct4x4_16_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+RTCD_EXTERN void (*vpx_highbd_idct4x4_16_add)(const tran_low_t *input, uint16_t *dest, int stride, int bd);
-void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_c
-void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-RTCD_EXTERN void (*vpx_highbd_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+RTCD_EXTERN void (*vpx_highbd_idct8x8_12_add)(const tran_low_t *input, uint16_t *dest, int stride, int bd);
-void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_c
-void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-RTCD_EXTERN void (*vpx_highbd_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+RTCD_EXTERN void (*vpx_highbd_idct8x8_64_add)(const tran_low_t *input, uint16_t *dest, int stride, int bd);
-void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c
-void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c
void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
@@ -2030,6 +2039,8 @@ static void setup_rtcd_internal(void)
if (flags & HAS_SSE2) vpx_avg_4x4 = vpx_avg_4x4_sse2;
vpx_avg_8x8 = vpx_avg_8x8_c;
if (flags & HAS_SSE2) vpx_avg_8x8 = vpx_avg_8x8_sse2;
+ vpx_comp_avg_pred = vpx_comp_avg_pred_c;
+ if (flags & HAS_SSE2) vpx_comp_avg_pred = vpx_comp_avg_pred_sse2;
vpx_convolve8 = vpx_convolve8_c;
if (flags & HAS_SSE2) vpx_convolve8 = vpx_convolve8_sse2;
if (flags & HAS_SSSE3) vpx_convolve8 = vpx_convolve8_ssse3;
@@ -2360,10 +2371,24 @@ static void setup_rtcd_internal(void)
if (flags & HAS_SSE2) vpx_highbd_8_variance8x16 = vpx_highbd_8_variance8x16_sse2;
vpx_highbd_8_variance8x8 = vpx_highbd_8_variance8x8_c;
if (flags & HAS_SSE2) vpx_highbd_8_variance8x8 = vpx_highbd_8_variance8x8_sse2;
+ vpx_highbd_convolve8 = vpx_highbd_convolve8_c;
+ if (flags & HAS_AVX2) vpx_highbd_convolve8 = vpx_highbd_convolve8_avx2;
+ vpx_highbd_convolve8_avg = vpx_highbd_convolve8_avg_c;
+ if (flags & HAS_AVX2) vpx_highbd_convolve8_avg = vpx_highbd_convolve8_avg_avx2;
+ vpx_highbd_convolve8_avg_horiz = vpx_highbd_convolve8_avg_horiz_c;
+ if (flags & HAS_AVX2) vpx_highbd_convolve8_avg_horiz = vpx_highbd_convolve8_avg_horiz_avx2;
+ vpx_highbd_convolve8_avg_vert = vpx_highbd_convolve8_avg_vert_c;
+ if (flags & HAS_AVX2) vpx_highbd_convolve8_avg_vert = vpx_highbd_convolve8_avg_vert_avx2;
+ vpx_highbd_convolve8_horiz = vpx_highbd_convolve8_horiz_c;
+ if (flags & HAS_AVX2) vpx_highbd_convolve8_horiz = vpx_highbd_convolve8_horiz_avx2;
+ vpx_highbd_convolve8_vert = vpx_highbd_convolve8_vert_c;
+ if (flags & HAS_AVX2) vpx_highbd_convolve8_vert = vpx_highbd_convolve8_vert_avx2;
vpx_highbd_convolve_avg = vpx_highbd_convolve_avg_c;
if (flags & HAS_SSE2) vpx_highbd_convolve_avg = vpx_highbd_convolve_avg_sse2;
+ if (flags & HAS_AVX2) vpx_highbd_convolve_avg = vpx_highbd_convolve_avg_avx2;
vpx_highbd_convolve_copy = vpx_highbd_convolve_copy_c;
if (flags & HAS_SSE2) vpx_highbd_convolve_copy = vpx_highbd_convolve_copy_sse2;
+ if (flags & HAS_AVX2) vpx_highbd_convolve_copy = vpx_highbd_convolve_copy_avx2;
vpx_highbd_dc_predictor_16x16 = vpx_highbd_dc_predictor_16x16_c;
if (flags & HAS_SSE2) vpx_highbd_dc_predictor_16x16 = vpx_highbd_dc_predictor_16x16_sse2;
vpx_highbd_dc_predictor_32x32 = vpx_highbd_dc_predictor_32x32_c;
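
The block above is the generated dispatch: each RTCD_EXTERN symbol is a function pointer that setup_rtcd_internal() promotes from the C baseline through SSE2 to the newly added AVX2 kernels, with later assignments overwriting earlier ones. A minimal self-contained sketch of that pattern follows; the names, flag values, and averaging body are illustrative stand-ins, not the generated code.

#include <stdint.h>

/* Flag values mirror vpx_ports/x86.h, but treat everything here as an
 * illustrative stand-in for the generated header, not the header itself. */
#define HAS_SSE2 0x04
#define HAS_AVX2 0x80

typedef void (*avg_fn)(const uint16_t *src, uint16_t *dst, int n);

static void avg_c(const uint16_t *src, uint16_t *dst, int n) {
  for (int i = 0; i < n; ++i) dst[i] = (uint16_t)((dst[i] + src[i] + 1) >> 1);
}
/* The SIMD bodies are elided; they only need the same signature. */
static void avg_sse2(const uint16_t *src, uint16_t *dst, int n) { avg_c(src, dst, n); }
static void avg_avx2(const uint16_t *src, uint16_t *dst, int n) { avg_c(src, dst, n); }

avg_fn vpx_avg_dispatch;  /* plays the role of an RTCD_EXTERN pointer */

void setup_rtcd_sketch(int flags) {
  /* Same shape as setup_rtcd_internal(): start at the C baseline and let
   * each supported extension overwrite it, so the last match wins. */
  vpx_avg_dispatch = avg_c;
  if (flags & HAS_SSE2) vpx_avg_dispatch = avg_sse2;
  if (flags & HAS_AVX2) vpx_avg_dispatch = avg_avx2;
}
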
diff --git a/chromium/third_party/libvpx/source/config/linux/mips64el/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/linux/mips64el/vp9_rtcd.h
index c0174f2ffa8..c7f905eb1e8 100644
--- a/chromium/third_party/libvpx/source/config/linux/mips64el/vp9_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/linux/mips64el/vp9_rtcd.h
@@ -14,6 +14,7 @@
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_enums.h"
+#include "vp9/common/vp9_filter.h"
struct macroblockd;
@@ -80,10 +81,10 @@ void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_
void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c
-void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst);
+void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
#define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_c
-void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
+void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
#define vp9_temporal_filter_apply vp9_temporal_filter_apply_c
void vp9_rtcd(void);
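
The new vp9_scale_and_extend_frame signature threads an INTERP_FILTER selector and a phase_scaler offset down into the scaler. Below is a hedged sketch of what such a phase argument does to the sampling position, assuming the usual 1/16-pel (q4) convention; the real function interpolates with an 8-tap kernel chosen by INTERP_FILTER, and the 2-tap bilinear here is a stand-in to keep the sketch short.

#include <stdint.h>

void scale_row_sketch(const uint8_t *src, int src_w,
                      uint8_t *dst, int dst_w, int phase_scaler) {
  for (int x = 0; x < dst_w; ++x) {
    /* Source position in 1/16-pel units, shifted by the phase offset. */
    int pos_q4 = x * src_w * 16 / dst_w + phase_scaler;
    int i = pos_q4 >> 4;
    int frac = pos_q4 & 15;
    /* Clamp at the edge; the real scaler relies on the frame's extended
     * border instead. */
    if (i > src_w - 1) { i = src_w - 1; frac = 0; }
    int next = (i + 1 < src_w) ? src[i + 1] : src[src_w - 1];
    dst[x] = (uint8_t)((src[i] * (16 - frac) + next * frac + 8) >> 4);
  }
}
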
diff --git a/chromium/third_party/libvpx/source/config/linux/mipsel/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/linux/mipsel/vp9_rtcd.h
index c0174f2ffa8..c7f905eb1e8 100644
--- a/chromium/third_party/libvpx/source/config/linux/mipsel/vp9_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/linux/mipsel/vp9_rtcd.h
@@ -14,6 +14,7 @@
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_enums.h"
+#include "vp9/common/vp9_filter.h"
struct macroblockd;
@@ -80,10 +81,10 @@ void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_
void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c
-void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst);
+void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
#define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_c
-void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
+void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
#define vp9_temporal_filter_apply vp9_temporal_filter_apply_c
void vp9_rtcd(void);
diff --git a/chromium/third_party/libvpx/source/config/linux/x64/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/linux/x64/vp9_rtcd.h
index 58a2d4e7268..56d5840ce95 100644
--- a/chromium/third_party/libvpx/source/config/linux/x64/vp9_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/linux/x64/vp9_rtcd.h
@@ -14,6 +14,7 @@
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_enums.h"
+#include "vp9/common/vp9_filter.h"
struct macroblockd;
@@ -95,13 +96,13 @@ void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, i
void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
#define vp9_highbd_fwht4x4 vp9_highbd_fwht4x4_c
-void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd);
+void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd);
#define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c
-void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd);
+void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
#define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c
-void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd);
+void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
#define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c
void vp9_highbd_mbpost_proc_across_ip_c(uint16_t *src, int pitch, int rows, int cols, int flimit);
@@ -119,7 +120,7 @@ void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, in
void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
#define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_c
-void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
+void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
#define vp9_highbd_temporal_filter_apply vp9_highbd_temporal_filter_apply_c
void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
@@ -143,13 +144,13 @@ void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int
void vp9_quantize_fp_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
RTCD_EXTERN void (*vp9_quantize_fp_32x32)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst);
-void vp9_scale_and_extend_frame_ssse3(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst);
-RTCD_EXTERN void (*vp9_scale_and_extend_frame)(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst);
+void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
+void vp9_scale_and_extend_frame_ssse3(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
+RTCD_EXTERN void (*vp9_scale_and_extend_frame)(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
-void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
-void vp9_temporal_filter_apply_sse2(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
-#define vp9_temporal_filter_apply vp9_temporal_filter_apply_sse2
+void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
+void vp9_temporal_filter_apply_sse4_1(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
+RTCD_EXTERN void (*vp9_temporal_filter_apply)(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
void vp9_rtcd(void);
@@ -176,6 +177,8 @@ static void setup_rtcd_internal(void)
if (flags & HAS_SSSE3) vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_ssse3;
vp9_scale_and_extend_frame = vp9_scale_and_extend_frame_c;
if (flags & HAS_SSSE3) vp9_scale_and_extend_frame = vp9_scale_and_extend_frame_ssse3;
+ vp9_temporal_filter_apply = vp9_temporal_filter_apply_c;
+ if (flags & HAS_SSE4_1) vp9_temporal_filter_apply = vp9_temporal_filter_apply_sse4_1;
}
#endif
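
The hunks above retype the temporal filter's accumulator from unsigned int to the fixed-width uint32_t and replace the statically selected SSE2 kernel with an SSE4.1 one behind runtime dispatch. A caller-side sketch of how an accumulator/count pair of that shape is consumed follows; the constant weight and the 16x16 block size are assumptions, since the real filter derives per-pixel weights from strength and filter_weight.

#include <stdint.h>

enum { BW = 16, BH = 16 };  /* one 16x16 filtering block, assumed here */

/* Accumulate one frame's contribution. Entries grow roughly as
 * weight * 255 * frames, which is why the prototype pins the
 * accumulator type to uint32_t. */
void accumulate_sketch(const uint8_t *frame, unsigned int stride, int weight,
                       uint32_t *accumulator, uint16_t *count) {
  for (int r = 0; r < BH; ++r)
    for (int c = 0; c < BW; ++c) {
      accumulator[r * BW + c] += (uint32_t)weight * frame[r * stride + c];
      count[r * BW + c] += (uint16_t)weight;
    }
}

/* After all frames: normalize back to 8-bit pixels with rounding
 * (assumes at least one accumulated frame, so count is nonzero). */
void normalize_sketch(const uint32_t *accumulator, const uint16_t *count,
                      uint8_t *out, unsigned int stride) {
  for (int r = 0; r < BH; ++r)
    for (int c = 0; c < BW; ++c)
      out[r * stride + c] = (uint8_t)((accumulator[r * BW + c] +
                                       count[r * BW + c] / 2) /
                                      count[r * BW + c]);
}
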
diff --git a/chromium/third_party/libvpx/source/config/linux/x64/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/linux/x64/vpx_dsp_rtcd.h
index 889fca7c45a..b2403c36bc4 100644
--- a/chromium/third_party/libvpx/source/config/linux/x64/vpx_dsp_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/linux/x64/vpx_dsp_rtcd.h
@@ -28,7 +28,8 @@ unsigned int vpx_avg_8x8_sse2(const uint8_t *, int p);
#define vpx_avg_8x8 vpx_avg_8x8_sse2
void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
-#define vpx_comp_avg_pred vpx_comp_avg_pred_c
+void vpx_comp_avg_pred_sse2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
+#define vpx_comp_avg_pred vpx_comp_avg_pred_sse2
void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
@@ -825,37 +826,45 @@ unsigned int vpx_highbd_avg_8x8_c(const uint8_t *, int p);
void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride);
#define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_c
-void vpx_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-#define vpx_highbd_convolve8 vpx_highbd_convolve8_sse2
-
-void vpx_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-#define vpx_highbd_convolve8_avg vpx_highbd_convolve8_avg_sse2
-
-void vpx_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-#define vpx_highbd_convolve8_avg_horiz vpx_highbd_convolve8_avg_horiz_sse2
-
-void vpx_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-#define vpx_highbd_convolve8_avg_vert vpx_highbd_convolve8_avg_vert_sse2
-
-void vpx_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-#define vpx_highbd_convolve8_horiz vpx_highbd_convolve8_horiz_sse2
-
-void vpx_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-#define vpx_highbd_convolve8_vert vpx_highbd_convolve8_vert_sse2
-
-void vpx_highbd_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-#define vpx_highbd_convolve_avg vpx_highbd_convolve_avg_sse2
-
-void vpx_highbd_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-#define vpx_highbd_convolve_copy vpx_highbd_convolve_copy_sse2
+void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve8)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+
+void vpx_highbd_convolve8_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avg_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avg_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve8_avg)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+
+void vpx_highbd_convolve8_avg_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avg_horiz_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avg_horiz_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve8_avg_horiz)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+
+void vpx_highbd_convolve8_avg_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avg_vert_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avg_vert_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve8_avg_vert)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+
+void vpx_highbd_convolve8_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_horiz_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_horiz_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve8_horiz)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+
+void vpx_highbd_convolve8_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_vert_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_vert_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve8_vert)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+
+void vpx_highbd_convolve_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve_avg_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve_avg)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+
+void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve_copy_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve_copy)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
void vpx_highbd_d117_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
#define vpx_highbd_d117_predictor_16x16 vpx_highbd_d117_predictor_16x16_c
@@ -1022,56 +1031,56 @@ void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint1
void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
#define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_c
-void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct16x16_10_add vpx_highbd_idct16x16_10_add_sse2
-void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_c
-void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct16x16_256_add vpx_highbd_idct16x16_256_add_sse2
-void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct16x16_38_add vpx_highbd_idct16x16_256_add_sse2
-void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_c
-void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct32x32_135_add vpx_highbd_idct32x32_135_add_c
-void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct32x32_1_add vpx_highbd_idct32x32_1_add_sse2
-void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_c
-void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct4x4_16_add vpx_highbd_idct4x4_16_add_sse2
-void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_c
-void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct8x8_12_add vpx_highbd_idct8x8_12_add_sse2
-void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_c
-void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct8x8_64_add vpx_highbd_idct8x8_64_add_sse2
-void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c
-void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c
void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
@@ -2082,6 +2091,22 @@ static void setup_rtcd_internal(void)
if (flags & HAS_AVX2) vpx_get16x16var = vpx_get16x16var_avx2;
vpx_hadamard_8x8 = vpx_hadamard_8x8_sse2;
if (flags & HAS_SSSE3) vpx_hadamard_8x8 = vpx_hadamard_8x8_ssse3;
+ vpx_highbd_convolve8 = vpx_highbd_convolve8_sse2;
+ if (flags & HAS_AVX2) vpx_highbd_convolve8 = vpx_highbd_convolve8_avx2;
+ vpx_highbd_convolve8_avg = vpx_highbd_convolve8_avg_sse2;
+ if (flags & HAS_AVX2) vpx_highbd_convolve8_avg = vpx_highbd_convolve8_avg_avx2;
+ vpx_highbd_convolve8_avg_horiz = vpx_highbd_convolve8_avg_horiz_sse2;
+ if (flags & HAS_AVX2) vpx_highbd_convolve8_avg_horiz = vpx_highbd_convolve8_avg_horiz_avx2;
+ vpx_highbd_convolve8_avg_vert = vpx_highbd_convolve8_avg_vert_sse2;
+ if (flags & HAS_AVX2) vpx_highbd_convolve8_avg_vert = vpx_highbd_convolve8_avg_vert_avx2;
+ vpx_highbd_convolve8_horiz = vpx_highbd_convolve8_horiz_sse2;
+ if (flags & HAS_AVX2) vpx_highbd_convolve8_horiz = vpx_highbd_convolve8_horiz_avx2;
+ vpx_highbd_convolve8_vert = vpx_highbd_convolve8_vert_sse2;
+ if (flags & HAS_AVX2) vpx_highbd_convolve8_vert = vpx_highbd_convolve8_vert_avx2;
+ vpx_highbd_convolve_avg = vpx_highbd_convolve_avg_sse2;
+ if (flags & HAS_AVX2) vpx_highbd_convolve_avg = vpx_highbd_convolve_avg_avx2;
+ vpx_highbd_convolve_copy = vpx_highbd_convolve_copy_sse2;
+ if (flags & HAS_AVX2) vpx_highbd_convolve_copy = vpx_highbd_convolve_copy_avx2;
vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_sse2;
if (flags & HAS_SSSE3) vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_ssse3;
vpx_idct32x32_135_add = vpx_idct32x32_1024_add_sse2;
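
Throughout this header the high-bit-depth convolve and idct entry points change from uint8_t * to uint16_t * buffers, so the cast out of libvpx's packed-pointer convention moves from inside each kernel to the call site. A self-contained sketch of that caller-side effect follows; CONVERT_TO_SHORTPTR is the real libvpx macro, reproduced here, while highbd_copy and copy_plane_sketch are illustrative.

#include <stddef.h>
#include <stdint.h>

/* Reproduced libvpx macro: high-bit-depth planes hold uint16_t samples,
 * and the legacy uint8_t handle stores half the byte address. */
#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))

/* Stand-in with the new prototype shape: uint16_t in and out. */
static void highbd_copy(const uint16_t *src, ptrdiff_t src_stride,
                        uint16_t *dst, ptrdiff_t dst_stride, int w, int h) {
  for (int r = 0; r < h; ++r)
    for (int c = 0; c < w; ++c)
      dst[r * dst_stride + c] = src[r * src_stride + c];
}

/* Caller-side effect of the retyping: the cast that previously happened
 * inside every highbd kernel now happens once, at the boundary. */
void copy_plane_sketch(const uint8_t *src8, ptrdiff_t stride,
                       uint8_t *dst8, int w, int h) {
  highbd_copy(CONVERT_TO_SHORTPTR(src8), stride,
              CONVERT_TO_SHORTPTR(dst8), stride, w, h);
}
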
diff --git a/chromium/third_party/libvpx/source/config/mac/ia32/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/mac/ia32/vp9_rtcd.h
index 6b04a45895d..c178d191672 100644
--- a/chromium/third_party/libvpx/source/config/mac/ia32/vp9_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/mac/ia32/vp9_rtcd.h
@@ -14,6 +14,7 @@
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_enums.h"
+#include "vp9/common/vp9_filter.h"
struct macroblockd;
@@ -95,13 +96,13 @@ void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, i
void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
#define vp9_highbd_fwht4x4 vp9_highbd_fwht4x4_c
-void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd);
+void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd);
#define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c
-void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd);
+void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
#define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c
-void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd);
+void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
#define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c
void vp9_highbd_mbpost_proc_across_ip_c(uint16_t *src, int pitch, int rows, int cols, int flimit);
@@ -119,7 +120,7 @@ void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, in
void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
#define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_c
-void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
+void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
#define vp9_highbd_temporal_filter_apply vp9_highbd_temporal_filter_apply_c
void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
@@ -141,13 +142,13 @@ RTCD_EXTERN void (*vp9_quantize_fp)(const tran_low_t *coeff_ptr, intptr_t n_coef
void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c
-void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst);
-void vp9_scale_and_extend_frame_ssse3(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst);
-RTCD_EXTERN void (*vp9_scale_and_extend_frame)(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst);
+void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
+void vp9_scale_and_extend_frame_ssse3(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
+RTCD_EXTERN void (*vp9_scale_and_extend_frame)(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
-void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
-void vp9_temporal_filter_apply_sse2(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
-RTCD_EXTERN void (*vp9_temporal_filter_apply)(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
+void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
+void vp9_temporal_filter_apply_sse4_1(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
+RTCD_EXTERN void (*vp9_temporal_filter_apply)(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
void vp9_rtcd(void);
@@ -198,7 +199,7 @@ static void setup_rtcd_internal(void)
vp9_scale_and_extend_frame = vp9_scale_and_extend_frame_c;
if (flags & HAS_SSSE3) vp9_scale_and_extend_frame = vp9_scale_and_extend_frame_ssse3;
vp9_temporal_filter_apply = vp9_temporal_filter_apply_c;
- if (flags & HAS_SSE2) vp9_temporal_filter_apply = vp9_temporal_filter_apply_sse2;
+ if (flags & HAS_SSE4_1) vp9_temporal_filter_apply = vp9_temporal_filter_apply_sse4_1;
}
#endif
diff --git a/chromium/third_party/libvpx/source/config/mac/ia32/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/mac/ia32/vpx_dsp_rtcd.h
index 2ebbf6e3fa3..49450cda3db 100644
--- a/chromium/third_party/libvpx/source/config/mac/ia32/vpx_dsp_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/mac/ia32/vpx_dsp_rtcd.h
@@ -28,7 +28,8 @@ unsigned int vpx_avg_8x8_sse2(const uint8_t *, int p);
RTCD_EXTERN unsigned int (*vpx_avg_8x8)(const uint8_t *, int p);
void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
-#define vpx_comp_avg_pred vpx_comp_avg_pred_c
+void vpx_comp_avg_pred_sse2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
+RTCD_EXTERN void (*vpx_comp_avg_pred)(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
@@ -824,31 +825,39 @@ unsigned int vpx_highbd_avg_8x8_c(const uint8_t *, int p);
void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride);
#define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_c
-void vpx_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-#define vpx_highbd_convolve8 vpx_highbd_convolve8_c
+void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve8)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-#define vpx_highbd_convolve8_avg vpx_highbd_convolve8_avg_c
+void vpx_highbd_convolve8_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avg_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve8_avg)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-#define vpx_highbd_convolve8_avg_horiz vpx_highbd_convolve8_avg_horiz_c
+void vpx_highbd_convolve8_avg_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avg_horiz_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve8_avg_horiz)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-#define vpx_highbd_convolve8_avg_vert vpx_highbd_convolve8_avg_vert_c
+void vpx_highbd_convolve8_avg_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avg_vert_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve8_avg_vert)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-#define vpx_highbd_convolve8_horiz vpx_highbd_convolve8_horiz_c
+void vpx_highbd_convolve8_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_horiz_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve8_horiz)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-#define vpx_highbd_convolve8_vert vpx_highbd_convolve8_vert_c
+void vpx_highbd_convolve8_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_vert_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve8_vert)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-RTCD_EXTERN void (*vpx_highbd_convolve_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve_avg_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve_avg)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-RTCD_EXTERN void (*vpx_highbd_convolve_copy)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve_copy_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve_copy)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
void vpx_highbd_d117_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
#define vpx_highbd_d117_predictor_16x16 vpx_highbd_d117_predictor_16x16_c
@@ -1015,56 +1024,56 @@ void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint1
void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
#define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_c
-void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-RTCD_EXTERN void (*vpx_highbd_idct16x16_10_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+RTCD_EXTERN void (*vpx_highbd_idct16x16_10_add)(const tran_low_t *input, uint16_t *dest, int stride, int bd);
-void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_c
-void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-RTCD_EXTERN void (*vpx_highbd_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+RTCD_EXTERN void (*vpx_highbd_idct16x16_256_add)(const tran_low_t *input, uint16_t *dest, int stride, int bd);
-void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-RTCD_EXTERN void (*vpx_highbd_idct16x16_38_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+RTCD_EXTERN void (*vpx_highbd_idct16x16_38_add)(const tran_low_t *input, uint16_t *dest, int stride, int bd);
-void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_c
-void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct32x32_135_add vpx_highbd_idct32x32_135_add_c
-void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-RTCD_EXTERN void (*vpx_highbd_idct32x32_1_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+RTCD_EXTERN void (*vpx_highbd_idct32x32_1_add)(const tran_low_t *input, uint16_t *dest, int stride, int bd);
-void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_c
-void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-RTCD_EXTERN void (*vpx_highbd_idct4x4_16_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+RTCD_EXTERN void (*vpx_highbd_idct4x4_16_add)(const tran_low_t *input, uint16_t *dest, int stride, int bd);
-void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_c
-void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-RTCD_EXTERN void (*vpx_highbd_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+RTCD_EXTERN void (*vpx_highbd_idct8x8_12_add)(const tran_low_t *input, uint16_t *dest, int stride, int bd);
-void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_c
-void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-RTCD_EXTERN void (*vpx_highbd_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+RTCD_EXTERN void (*vpx_highbd_idct8x8_64_add)(const tran_low_t *input, uint16_t *dest, int stride, int bd);
-void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c
-void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c
void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
@@ -2030,6 +2039,8 @@ static void setup_rtcd_internal(void)
if (flags & HAS_SSE2) vpx_avg_4x4 = vpx_avg_4x4_sse2;
vpx_avg_8x8 = vpx_avg_8x8_c;
if (flags & HAS_SSE2) vpx_avg_8x8 = vpx_avg_8x8_sse2;
+ vpx_comp_avg_pred = vpx_comp_avg_pred_c;
+ if (flags & HAS_SSE2) vpx_comp_avg_pred = vpx_comp_avg_pred_sse2;
vpx_convolve8 = vpx_convolve8_c;
if (flags & HAS_SSE2) vpx_convolve8 = vpx_convolve8_sse2;
if (flags & HAS_SSSE3) vpx_convolve8 = vpx_convolve8_ssse3;
@@ -2360,10 +2371,24 @@ static void setup_rtcd_internal(void)
if (flags & HAS_SSE2) vpx_highbd_8_variance8x16 = vpx_highbd_8_variance8x16_sse2;
vpx_highbd_8_variance8x8 = vpx_highbd_8_variance8x8_c;
if (flags & HAS_SSE2) vpx_highbd_8_variance8x8 = vpx_highbd_8_variance8x8_sse2;
+ vpx_highbd_convolve8 = vpx_highbd_convolve8_c;
+ if (flags & HAS_AVX2) vpx_highbd_convolve8 = vpx_highbd_convolve8_avx2;
+ vpx_highbd_convolve8_avg = vpx_highbd_convolve8_avg_c;
+ if (flags & HAS_AVX2) vpx_highbd_convolve8_avg = vpx_highbd_convolve8_avg_avx2;
+ vpx_highbd_convolve8_avg_horiz = vpx_highbd_convolve8_avg_horiz_c;
+ if (flags & HAS_AVX2) vpx_highbd_convolve8_avg_horiz = vpx_highbd_convolve8_avg_horiz_avx2;
+ vpx_highbd_convolve8_avg_vert = vpx_highbd_convolve8_avg_vert_c;
+ if (flags & HAS_AVX2) vpx_highbd_convolve8_avg_vert = vpx_highbd_convolve8_avg_vert_avx2;
+ vpx_highbd_convolve8_horiz = vpx_highbd_convolve8_horiz_c;
+ if (flags & HAS_AVX2) vpx_highbd_convolve8_horiz = vpx_highbd_convolve8_horiz_avx2;
+ vpx_highbd_convolve8_vert = vpx_highbd_convolve8_vert_c;
+ if (flags & HAS_AVX2) vpx_highbd_convolve8_vert = vpx_highbd_convolve8_vert_avx2;
vpx_highbd_convolve_avg = vpx_highbd_convolve_avg_c;
if (flags & HAS_SSE2) vpx_highbd_convolve_avg = vpx_highbd_convolve_avg_sse2;
+ if (flags & HAS_AVX2) vpx_highbd_convolve_avg = vpx_highbd_convolve_avg_avx2;
vpx_highbd_convolve_copy = vpx_highbd_convolve_copy_c;
if (flags & HAS_SSE2) vpx_highbd_convolve_copy = vpx_highbd_convolve_copy_sse2;
+ if (flags & HAS_AVX2) vpx_highbd_convolve_copy = vpx_highbd_convolve_copy_avx2;
vpx_highbd_dc_predictor_16x16 = vpx_highbd_dc_predictor_16x16_c;
if (flags & HAS_SSE2) vpx_highbd_dc_predictor_16x16 = vpx_highbd_dc_predictor_16x16_sse2;
vpx_highbd_dc_predictor_32x32 = vpx_highbd_dc_predictor_32x32_c;
diff --git a/chromium/third_party/libvpx/source/config/mac/x64/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/mac/x64/vp9_rtcd.h
index 58a2d4e7268..56d5840ce95 100644
--- a/chromium/third_party/libvpx/source/config/mac/x64/vp9_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/mac/x64/vp9_rtcd.h
@@ -14,6 +14,7 @@
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_enums.h"
+#include "vp9/common/vp9_filter.h"
struct macroblockd;
@@ -95,13 +96,13 @@ void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, i
void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
#define vp9_highbd_fwht4x4 vp9_highbd_fwht4x4_c
-void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd);
+void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd);
#define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c
-void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd);
+void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
#define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c
-void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd);
+void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
#define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c
void vp9_highbd_mbpost_proc_across_ip_c(uint16_t *src, int pitch, int rows, int cols, int flimit);
@@ -119,7 +120,7 @@ void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, in
void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
#define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_c
-void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
+void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
#define vp9_highbd_temporal_filter_apply vp9_highbd_temporal_filter_apply_c
void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
@@ -143,13 +144,13 @@ void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int
void vp9_quantize_fp_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
RTCD_EXTERN void (*vp9_quantize_fp_32x32)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst);
-void vp9_scale_and_extend_frame_ssse3(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst);
-RTCD_EXTERN void (*vp9_scale_and_extend_frame)(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst);
+void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
+void vp9_scale_and_extend_frame_ssse3(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
+RTCD_EXTERN void (*vp9_scale_and_extend_frame)(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
-void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
-void vp9_temporal_filter_apply_sse2(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
-#define vp9_temporal_filter_apply vp9_temporal_filter_apply_sse2
+void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
+void vp9_temporal_filter_apply_sse4_1(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
+RTCD_EXTERN void (*vp9_temporal_filter_apply)(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
void vp9_rtcd(void);
@@ -176,6 +177,8 @@ static void setup_rtcd_internal(void)
if (flags & HAS_SSSE3) vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_ssse3;
vp9_scale_and_extend_frame = vp9_scale_and_extend_frame_c;
if (flags & HAS_SSSE3) vp9_scale_and_extend_frame = vp9_scale_and_extend_frame_ssse3;
+ vp9_temporal_filter_apply = vp9_temporal_filter_apply_c;
+ if (flags & HAS_SSE4_1) vp9_temporal_filter_apply = vp9_temporal_filter_apply_sse4_1;
}
#endif
diff --git a/chromium/third_party/libvpx/source/config/mac/x64/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/mac/x64/vpx_dsp_rtcd.h
index 889fca7c45a..b2403c36bc4 100644
--- a/chromium/third_party/libvpx/source/config/mac/x64/vpx_dsp_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/mac/x64/vpx_dsp_rtcd.h
@@ -28,7 +28,8 @@ unsigned int vpx_avg_8x8_sse2(const uint8_t *, int p);
#define vpx_avg_8x8 vpx_avg_8x8_sse2
void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
-#define vpx_comp_avg_pred vpx_comp_avg_pred_c
+void vpx_comp_avg_pred_sse2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
+#define vpx_comp_avg_pred vpx_comp_avg_pred_sse2
void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
@@ -825,37 +826,45 @@ unsigned int vpx_highbd_avg_8x8_c(const uint8_t *, int p);
void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride);
#define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_c
-void vpx_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-#define vpx_highbd_convolve8 vpx_highbd_convolve8_sse2
-
-void vpx_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-#define vpx_highbd_convolve8_avg vpx_highbd_convolve8_avg_sse2
-
-void vpx_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-#define vpx_highbd_convolve8_avg_horiz vpx_highbd_convolve8_avg_horiz_sse2
-
-void vpx_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-#define vpx_highbd_convolve8_avg_vert vpx_highbd_convolve8_avg_vert_sse2
-
-void vpx_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-#define vpx_highbd_convolve8_horiz vpx_highbd_convolve8_horiz_sse2
-
-void vpx_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-#define vpx_highbd_convolve8_vert vpx_highbd_convolve8_vert_sse2
-
-void vpx_highbd_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-#define vpx_highbd_convolve_avg vpx_highbd_convolve_avg_sse2
-
-void vpx_highbd_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-#define vpx_highbd_convolve_copy vpx_highbd_convolve_copy_sse2
+void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve8)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+
+void vpx_highbd_convolve8_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avg_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avg_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve8_avg)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+
+void vpx_highbd_convolve8_avg_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avg_horiz_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avg_horiz_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve8_avg_horiz)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+
+void vpx_highbd_convolve8_avg_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avg_vert_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avg_vert_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve8_avg_vert)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+
+void vpx_highbd_convolve8_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_horiz_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_horiz_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve8_horiz)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+
+void vpx_highbd_convolve8_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_vert_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_vert_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve8_vert)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+
+void vpx_highbd_convolve_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve_avg_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve_avg)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+
+void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve_copy_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve_copy)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
void vpx_highbd_d117_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
#define vpx_highbd_d117_predictor_16x16 vpx_highbd_d117_predictor_16x16_c
@@ -1022,56 +1031,56 @@ void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint1
void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
#define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_c
-void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct16x16_10_add vpx_highbd_idct16x16_10_add_sse2
-void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_c
-void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct16x16_256_add vpx_highbd_idct16x16_256_add_sse2
-void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct16x16_38_add vpx_highbd_idct16x16_256_add_sse2
-void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_c
-void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct32x32_135_add vpx_highbd_idct32x32_135_add_c
-void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct32x32_1_add vpx_highbd_idct32x32_1_add_sse2
-void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_c
-void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct4x4_16_add vpx_highbd_idct4x4_16_add_sse2
-void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_c
-void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct8x8_12_add vpx_highbd_idct8x8_12_add_sse2
-void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_c
-void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct8x8_64_add vpx_highbd_idct8x8_64_add_sse2
-void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c
-void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c
void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
@@ -2082,6 +2091,22 @@ static void setup_rtcd_internal(void)
if (flags & HAS_AVX2) vpx_get16x16var = vpx_get16x16var_avx2;
vpx_hadamard_8x8 = vpx_hadamard_8x8_sse2;
if (flags & HAS_SSSE3) vpx_hadamard_8x8 = vpx_hadamard_8x8_ssse3;
+ vpx_highbd_convolve8 = vpx_highbd_convolve8_sse2;
+ if (flags & HAS_AVX2) vpx_highbd_convolve8 = vpx_highbd_convolve8_avx2;
+ vpx_highbd_convolve8_avg = vpx_highbd_convolve8_avg_sse2;
+ if (flags & HAS_AVX2) vpx_highbd_convolve8_avg = vpx_highbd_convolve8_avg_avx2;
+ vpx_highbd_convolve8_avg_horiz = vpx_highbd_convolve8_avg_horiz_sse2;
+ if (flags & HAS_AVX2) vpx_highbd_convolve8_avg_horiz = vpx_highbd_convolve8_avg_horiz_avx2;
+ vpx_highbd_convolve8_avg_vert = vpx_highbd_convolve8_avg_vert_sse2;
+ if (flags & HAS_AVX2) vpx_highbd_convolve8_avg_vert = vpx_highbd_convolve8_avg_vert_avx2;
+ vpx_highbd_convolve8_horiz = vpx_highbd_convolve8_horiz_sse2;
+ if (flags & HAS_AVX2) vpx_highbd_convolve8_horiz = vpx_highbd_convolve8_horiz_avx2;
+ vpx_highbd_convolve8_vert = vpx_highbd_convolve8_vert_sse2;
+ if (flags & HAS_AVX2) vpx_highbd_convolve8_vert = vpx_highbd_convolve8_vert_avx2;
+ vpx_highbd_convolve_avg = vpx_highbd_convolve_avg_sse2;
+ if (flags & HAS_AVX2) vpx_highbd_convolve_avg = vpx_highbd_convolve_avg_avx2;
+ vpx_highbd_convolve_copy = vpx_highbd_convolve_copy_sse2;
+ if (flags & HAS_AVX2) vpx_highbd_convolve_copy = vpx_highbd_convolve_copy_avx2;
vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_sse2;
if (flags & HAS_SSSE3) vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_ssse3;
vpx_idct32x32_135_add = vpx_idct32x32_1024_add_sse2;
diff --git a/chromium/third_party/libvpx/source/config/nacl/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/nacl/vp9_rtcd.h
index 4a32a38e064..0e14191aaec 100644
--- a/chromium/third_party/libvpx/source/config/nacl/vp9_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/nacl/vp9_rtcd.h
@@ -14,6 +14,7 @@
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_enums.h"
+#include "vp9/common/vp9_filter.h"
struct macroblockd;
@@ -80,13 +81,13 @@ void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, i
void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
#define vp9_highbd_fwht4x4 vp9_highbd_fwht4x4_c
-void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd);
+void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd);
#define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c
-void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd);
+void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
#define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c
-void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd);
+void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
#define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c
void vp9_highbd_mbpost_proc_across_ip_c(uint16_t *src, int pitch, int rows, int cols, int flimit);
@@ -104,7 +105,7 @@ void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, in
void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
#define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_c
-void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
+void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
#define vp9_highbd_temporal_filter_apply vp9_highbd_temporal_filter_apply_c
void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
@@ -122,10 +123,10 @@ void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_
void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c
-void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst);
+void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
#define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_c
-void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
+void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
#define vp9_temporal_filter_apply vp9_temporal_filter_apply_c
void vp9_rtcd(void);
diff --git a/chromium/third_party/libvpx/source/config/nacl/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/nacl/vpx_dsp_rtcd.h
index 8d126acaa9f..a09ed559657 100644
--- a/chromium/third_party/libvpx/source/config/nacl/vpx_dsp_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/nacl/vpx_dsp_rtcd.h
@@ -652,28 +652,28 @@ unsigned int vpx_highbd_avg_8x8_c(const uint8_t *, int p);
void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride);
#define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_c
-void vpx_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
#define vpx_highbd_convolve8 vpx_highbd_convolve8_c
-void vpx_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
#define vpx_highbd_convolve8_avg vpx_highbd_convolve8_avg_c
-void vpx_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avg_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
#define vpx_highbd_convolve8_avg_horiz vpx_highbd_convolve8_avg_horiz_c
-void vpx_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avg_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
#define vpx_highbd_convolve8_avg_vert vpx_highbd_convolve8_avg_vert_c
-void vpx_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
#define vpx_highbd_convolve8_horiz vpx_highbd_convolve8_horiz_c
-void vpx_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
#define vpx_highbd_convolve8_vert vpx_highbd_convolve8_vert_c
-void vpx_highbd_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
#define vpx_highbd_convolve_avg vpx_highbd_convolve_avg_c
-void vpx_highbd_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
#define vpx_highbd_convolve_copy vpx_highbd_convolve_copy_c
void vpx_highbd_d117_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
@@ -832,49 +832,49 @@ void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint1
void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
#define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_c
-void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct16x16_10_add vpx_highbd_idct16x16_10_add_c
-void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_c
-void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct16x16_256_add vpx_highbd_idct16x16_256_add_c
-void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct16x16_38_add vpx_highbd_idct16x16_38_add_c
-void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_c
-void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct32x32_135_add vpx_highbd_idct32x32_135_add_c
-void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct32x32_1_add vpx_highbd_idct32x32_1_add_c
-void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_c
-void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct4x4_16_add vpx_highbd_idct4x4_16_add_c
-void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_c
-void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct8x8_12_add vpx_highbd_idct8x8_12_add_c
-void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_c
-void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct8x8_64_add vpx_highbd_idct8x8_64_add_c
-void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c
-void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c
void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
diff --git a/chromium/third_party/libvpx/source/config/vpx_version.h b/chromium/third_party/libvpx/source/config/vpx_version.h
index ebb12f2b240..e8dde9d3ea8 100644
--- a/chromium/third_party/libvpx/source/config/vpx_version.h
+++ b/chromium/third_party/libvpx/source/config/vpx_version.h
@@ -1,7 +1,7 @@
#define VERSION_MAJOR 1
#define VERSION_MINOR 6
#define VERSION_PATCH 1
-#define VERSION_EXTRA "446-gf22b828d6"
+#define VERSION_EXTRA "657-gb3bf91bdc"
#define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.6.1-446-gf22b828d6"
-#define VERSION_STRING " v1.6.1-446-gf22b828d6"
+#define VERSION_STRING_NOSP "v1.6.1-657-gb3bf91bdc"
+#define VERSION_STRING " v1.6.1-657-gb3bf91bdc"
diff --git a/chromium/third_party/libvpx/source/config/win/ia32/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/win/ia32/vp9_rtcd.h
index 6b04a45895d..c178d191672 100644
--- a/chromium/third_party/libvpx/source/config/win/ia32/vp9_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/win/ia32/vp9_rtcd.h
@@ -14,6 +14,7 @@
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_enums.h"
+#include "vp9/common/vp9_filter.h"
struct macroblockd;
@@ -95,13 +96,13 @@ void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, i
void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
#define vp9_highbd_fwht4x4 vp9_highbd_fwht4x4_c
-void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd);
+void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd);
#define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c
-void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd);
+void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
#define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c
-void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd);
+void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
#define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c
void vp9_highbd_mbpost_proc_across_ip_c(uint16_t *src, int pitch, int rows, int cols, int flimit);
@@ -119,7 +120,7 @@ void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, in
void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
#define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_c
-void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
+void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
#define vp9_highbd_temporal_filter_apply vp9_highbd_temporal_filter_apply_c
void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
@@ -141,13 +142,13 @@ RTCD_EXTERN void (*vp9_quantize_fp)(const tran_low_t *coeff_ptr, intptr_t n_coef
void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c
-void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst);
-void vp9_scale_and_extend_frame_ssse3(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst);
-RTCD_EXTERN void (*vp9_scale_and_extend_frame)(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst);
+void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
+void vp9_scale_and_extend_frame_ssse3(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
+RTCD_EXTERN void (*vp9_scale_and_extend_frame)(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
-void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
-void vp9_temporal_filter_apply_sse2(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
-RTCD_EXTERN void (*vp9_temporal_filter_apply)(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
+void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
+void vp9_temporal_filter_apply_sse4_1(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
+RTCD_EXTERN void (*vp9_temporal_filter_apply)(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
void vp9_rtcd(void);
@@ -198,7 +199,7 @@ static void setup_rtcd_internal(void)
vp9_scale_and_extend_frame = vp9_scale_and_extend_frame_c;
if (flags & HAS_SSSE3) vp9_scale_and_extend_frame = vp9_scale_and_extend_frame_ssse3;
vp9_temporal_filter_apply = vp9_temporal_filter_apply_c;
- if (flags & HAS_SSE2) vp9_temporal_filter_apply = vp9_temporal_filter_apply_sse2;
+ if (flags & HAS_SSE4_1) vp9_temporal_filter_apply = vp9_temporal_filter_apply_sse4_1;
}
#endif
diff --git a/chromium/third_party/libvpx/source/config/win/ia32/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/win/ia32/vpx_dsp_rtcd.h
index 2ebbf6e3fa3..49450cda3db 100644
--- a/chromium/third_party/libvpx/source/config/win/ia32/vpx_dsp_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/win/ia32/vpx_dsp_rtcd.h
@@ -28,7 +28,8 @@ unsigned int vpx_avg_8x8_sse2(const uint8_t *, int p);
RTCD_EXTERN unsigned int (*vpx_avg_8x8)(const uint8_t *, int p);
void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
-#define vpx_comp_avg_pred vpx_comp_avg_pred_c
+void vpx_comp_avg_pred_sse2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
+RTCD_EXTERN void (*vpx_comp_avg_pred)(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
@@ -824,31 +825,39 @@ unsigned int vpx_highbd_avg_8x8_c(const uint8_t *, int p);
void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride);
#define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_c
-void vpx_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-#define vpx_highbd_convolve8 vpx_highbd_convolve8_c
+void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve8)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-#define vpx_highbd_convolve8_avg vpx_highbd_convolve8_avg_c
+void vpx_highbd_convolve8_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avg_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve8_avg)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-#define vpx_highbd_convolve8_avg_horiz vpx_highbd_convolve8_avg_horiz_c
+void vpx_highbd_convolve8_avg_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avg_horiz_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve8_avg_horiz)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-#define vpx_highbd_convolve8_avg_vert vpx_highbd_convolve8_avg_vert_c
+void vpx_highbd_convolve8_avg_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avg_vert_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve8_avg_vert)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-#define vpx_highbd_convolve8_horiz vpx_highbd_convolve8_horiz_c
+void vpx_highbd_convolve8_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_horiz_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve8_horiz)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-#define vpx_highbd_convolve8_vert vpx_highbd_convolve8_vert_c
+void vpx_highbd_convolve8_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_vert_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve8_vert)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-RTCD_EXTERN void (*vpx_highbd_convolve_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve_avg_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve_avg)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-RTCD_EXTERN void (*vpx_highbd_convolve_copy)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve_copy_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve_copy)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
void vpx_highbd_d117_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
#define vpx_highbd_d117_predictor_16x16 vpx_highbd_d117_predictor_16x16_c
@@ -1015,56 +1024,56 @@ void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint1
void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
#define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_c
-void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-RTCD_EXTERN void (*vpx_highbd_idct16x16_10_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+RTCD_EXTERN void (*vpx_highbd_idct16x16_10_add)(const tran_low_t *input, uint16_t *dest, int stride, int bd);
-void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_c
-void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-RTCD_EXTERN void (*vpx_highbd_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+RTCD_EXTERN void (*vpx_highbd_idct16x16_256_add)(const tran_low_t *input, uint16_t *dest, int stride, int bd);
-void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-RTCD_EXTERN void (*vpx_highbd_idct16x16_38_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+RTCD_EXTERN void (*vpx_highbd_idct16x16_38_add)(const tran_low_t *input, uint16_t *dest, int stride, int bd);
-void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_c
-void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct32x32_135_add vpx_highbd_idct32x32_135_add_c
-void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-RTCD_EXTERN void (*vpx_highbd_idct32x32_1_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+RTCD_EXTERN void (*vpx_highbd_idct32x32_1_add)(const tran_low_t *input, uint16_t *dest, int stride, int bd);
-void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_c
-void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-RTCD_EXTERN void (*vpx_highbd_idct4x4_16_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+RTCD_EXTERN void (*vpx_highbd_idct4x4_16_add)(const tran_low_t *input, uint16_t *dest, int stride, int bd);
-void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_c
-void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-RTCD_EXTERN void (*vpx_highbd_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+RTCD_EXTERN void (*vpx_highbd_idct8x8_12_add)(const tran_low_t *input, uint16_t *dest, int stride, int bd);
-void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_c
-void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-RTCD_EXTERN void (*vpx_highbd_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+RTCD_EXTERN void (*vpx_highbd_idct8x8_64_add)(const tran_low_t *input, uint16_t *dest, int stride, int bd);
-void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c
-void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c
void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
@@ -2030,6 +2039,8 @@ static void setup_rtcd_internal(void)
if (flags & HAS_SSE2) vpx_avg_4x4 = vpx_avg_4x4_sse2;
vpx_avg_8x8 = vpx_avg_8x8_c;
if (flags & HAS_SSE2) vpx_avg_8x8 = vpx_avg_8x8_sse2;
+ vpx_comp_avg_pred = vpx_comp_avg_pred_c;
+ if (flags & HAS_SSE2) vpx_comp_avg_pred = vpx_comp_avg_pred_sse2;
vpx_convolve8 = vpx_convolve8_c;
if (flags & HAS_SSE2) vpx_convolve8 = vpx_convolve8_sse2;
if (flags & HAS_SSSE3) vpx_convolve8 = vpx_convolve8_ssse3;
@@ -2360,10 +2371,24 @@ static void setup_rtcd_internal(void)
if (flags & HAS_SSE2) vpx_highbd_8_variance8x16 = vpx_highbd_8_variance8x16_sse2;
vpx_highbd_8_variance8x8 = vpx_highbd_8_variance8x8_c;
if (flags & HAS_SSE2) vpx_highbd_8_variance8x8 = vpx_highbd_8_variance8x8_sse2;
+ vpx_highbd_convolve8 = vpx_highbd_convolve8_c;
+ if (flags & HAS_AVX2) vpx_highbd_convolve8 = vpx_highbd_convolve8_avx2;
+ vpx_highbd_convolve8_avg = vpx_highbd_convolve8_avg_c;
+ if (flags & HAS_AVX2) vpx_highbd_convolve8_avg = vpx_highbd_convolve8_avg_avx2;
+ vpx_highbd_convolve8_avg_horiz = vpx_highbd_convolve8_avg_horiz_c;
+ if (flags & HAS_AVX2) vpx_highbd_convolve8_avg_horiz = vpx_highbd_convolve8_avg_horiz_avx2;
+ vpx_highbd_convolve8_avg_vert = vpx_highbd_convolve8_avg_vert_c;
+ if (flags & HAS_AVX2) vpx_highbd_convolve8_avg_vert = vpx_highbd_convolve8_avg_vert_avx2;
+ vpx_highbd_convolve8_horiz = vpx_highbd_convolve8_horiz_c;
+ if (flags & HAS_AVX2) vpx_highbd_convolve8_horiz = vpx_highbd_convolve8_horiz_avx2;
+ vpx_highbd_convolve8_vert = vpx_highbd_convolve8_vert_c;
+ if (flags & HAS_AVX2) vpx_highbd_convolve8_vert = vpx_highbd_convolve8_vert_avx2;
vpx_highbd_convolve_avg = vpx_highbd_convolve_avg_c;
if (flags & HAS_SSE2) vpx_highbd_convolve_avg = vpx_highbd_convolve_avg_sse2;
+ if (flags & HAS_AVX2) vpx_highbd_convolve_avg = vpx_highbd_convolve_avg_avx2;
vpx_highbd_convolve_copy = vpx_highbd_convolve_copy_c;
if (flags & HAS_SSE2) vpx_highbd_convolve_copy = vpx_highbd_convolve_copy_sse2;
+ if (flags & HAS_AVX2) vpx_highbd_convolve_copy = vpx_highbd_convolve_copy_avx2;
vpx_highbd_dc_predictor_16x16 = vpx_highbd_dc_predictor_16x16_c;
if (flags & HAS_SSE2) vpx_highbd_dc_predictor_16x16 = vpx_highbd_dc_predictor_16x16_sse2;
vpx_highbd_dc_predictor_32x32 = vpx_highbd_dc_predictor_32x32_c;
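
A recurring change across these vpx_dsp headers: the high-bitdepth entry points now take uint16_t pixel pointers rather than uint8_t ones. 10- and 12-bit samples occupy 16 bits each, so a caller can pass the real frame buffer directly instead of every implementation converting internally. A usage sketch under the new signatures; the stub body, buffer sizes, and the tran_low_t typedef are illustrative assumptions:

    #include <stdint.h>
    #include <string.h>

    typedef int32_t tran_low_t; /* 32-bit when high bit depth is enabled */

    /* Stub with the prototype shape from the header above; the real
     * function adds the inverse transform of `input` into `dest`. */
    static void highbd_idct4x4_1_add(const tran_low_t *input, uint16_t *dest,
                                     int stride, int bd) {
      (void)input; (void)dest; (void)stride; (void)bd;
    }

    int main(void) {
      tran_low_t coeffs[16];
      uint16_t frame[4 * 8]; /* four rows at a stride of 8 16-bit samples */
      memset(coeffs, 0, sizeof(coeffs));
      memset(frame, 0, sizeof(frame));
      highbd_idct4x4_1_add(coeffs, frame, 8, 10); /* bd = 10-bit content */
      return 0;
    }
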
diff --git a/chromium/third_party/libvpx/source/config/win/x64/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/win/x64/vp9_rtcd.h
index 58a2d4e7268..56d5840ce95 100644
--- a/chromium/third_party/libvpx/source/config/win/x64/vp9_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/win/x64/vp9_rtcd.h
@@ -14,6 +14,7 @@
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_enums.h"
+#include "vp9/common/vp9_filter.h"
struct macroblockd;
@@ -95,13 +96,13 @@ void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, i
void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
#define vp9_highbd_fwht4x4 vp9_highbd_fwht4x4_c
-void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd);
+void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd);
#define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c
-void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd);
+void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
#define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c
-void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd);
+void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
#define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c
void vp9_highbd_mbpost_proc_across_ip_c(uint16_t *src, int pitch, int rows, int cols, int flimit);
@@ -119,7 +120,7 @@ void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, in
void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
#define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_c
-void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
+void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
#define vp9_highbd_temporal_filter_apply vp9_highbd_temporal_filter_apply_c
void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
@@ -143,13 +144,13 @@ void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int
void vp9_quantize_fp_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
RTCD_EXTERN void (*vp9_quantize_fp_32x32)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst);
-void vp9_scale_and_extend_frame_ssse3(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst);
-RTCD_EXTERN void (*vp9_scale_and_extend_frame)(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst);
+void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
+void vp9_scale_and_extend_frame_ssse3(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
+RTCD_EXTERN void (*vp9_scale_and_extend_frame)(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
-void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
-void vp9_temporal_filter_apply_sse2(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
-#define vp9_temporal_filter_apply vp9_temporal_filter_apply_sse2
+void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
+void vp9_temporal_filter_apply_sse4_1(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
+RTCD_EXTERN void (*vp9_temporal_filter_apply)(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
void vp9_rtcd(void);
@@ -176,6 +177,8 @@ static void setup_rtcd_internal(void)
if (flags & HAS_SSSE3) vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_ssse3;
vp9_scale_and_extend_frame = vp9_scale_and_extend_frame_c;
if (flags & HAS_SSSE3) vp9_scale_and_extend_frame = vp9_scale_and_extend_frame_ssse3;
+ vp9_temporal_filter_apply = vp9_temporal_filter_apply_c;
+ if (flags & HAS_SSE4_1) vp9_temporal_filter_apply = vp9_temporal_filter_apply_sse4_1;
}
#endif
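
Two things change for vp9_temporal_filter_apply here: the accumulator parameter becomes an explicit uint32_t *, and the x86 specialization moves from a compile-time SSE2 binding to a runtime-dispatched SSE4.1 one (SSE4.1, unlike SSE2, is not guaranteed on x86-64). The 32-bit accumulator matters because each pixel sums weight-scaled samples across frames. Roughly what the C reference does per pixel, as a simplified sketch rather than the exact libvpx source:

    #include <stdint.h>

    /* Simplified per-pixel core of the temporal filter: pixels that match
     * the prediction closely get larger blending weights. Assumes
     * strength > 0 and filter_weight in 0..2. */
    static void temporal_filter_px(uint8_t p1, uint8_t p2, int strength,
                                   int filter_weight, uint32_t *accumulator,
                                   uint16_t *count) {
      int diff = (int)p1 - (int)p2;
      int modifier = diff * diff * 3;           /* scaled squared error */
      modifier += 1 << (strength - 1);          /* rounding */
      modifier >>= strength;
      if (modifier > 16) modifier = 16;
      modifier = 16 - modifier;                 /* similarity -> weight */
      modifier *= filter_weight;
      *count += (uint16_t)modifier;             /* total weight fits 16 bits */
      *accumulator += (uint32_t)modifier * p2;  /* running sum needs 32 bits */
    }

    int main(void) {
      uint32_t acc = 0;
      uint16_t cnt = 0;
      temporal_filter_px(120, 118, 6, 2, &acc, &cnt);
      return acc > 0 ? 0 : 1;
    }
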
diff --git a/chromium/third_party/libvpx/source/config/win/x64/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/win/x64/vpx_dsp_rtcd.h
index 889fca7c45a..b2403c36bc4 100644
--- a/chromium/third_party/libvpx/source/config/win/x64/vpx_dsp_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/win/x64/vpx_dsp_rtcd.h
@@ -28,7 +28,8 @@ unsigned int vpx_avg_8x8_sse2(const uint8_t *, int p);
#define vpx_avg_8x8 vpx_avg_8x8_sse2
void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
-#define vpx_comp_avg_pred vpx_comp_avg_pred_c
+void vpx_comp_avg_pred_sse2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
+#define vpx_comp_avg_pred vpx_comp_avg_pred_sse2
void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
@@ -825,37 +826,45 @@ unsigned int vpx_highbd_avg_8x8_c(const uint8_t *, int p);
void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride);
#define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_c
-void vpx_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-#define vpx_highbd_convolve8 vpx_highbd_convolve8_sse2
-
-void vpx_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-#define vpx_highbd_convolve8_avg vpx_highbd_convolve8_avg_sse2
-
-void vpx_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-#define vpx_highbd_convolve8_avg_horiz vpx_highbd_convolve8_avg_horiz_sse2
-
-void vpx_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-#define vpx_highbd_convolve8_avg_vert vpx_highbd_convolve8_avg_vert_sse2
-
-void vpx_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-#define vpx_highbd_convolve8_horiz vpx_highbd_convolve8_horiz_sse2
-
-void vpx_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-#define vpx_highbd_convolve8_vert vpx_highbd_convolve8_vert_sse2
-
-void vpx_highbd_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-#define vpx_highbd_convolve_avg vpx_highbd_convolve_avg_sse2
-
-void vpx_highbd_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-void vpx_highbd_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
-#define vpx_highbd_convolve_copy vpx_highbd_convolve_copy_sse2
+void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve8)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+
+void vpx_highbd_convolve8_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avg_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avg_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve8_avg)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+
+void vpx_highbd_convolve8_avg_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avg_horiz_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avg_horiz_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve8_avg_horiz)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+
+void vpx_highbd_convolve8_avg_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avg_vert_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avg_vert_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve8_avg_vert)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+
+void vpx_highbd_convolve8_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_horiz_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_horiz_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve8_horiz)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+
+void vpx_highbd_convolve8_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_vert_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_vert_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve8_vert)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+
+void vpx_highbd_convolve_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve_avg_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve_avg)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+
+void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve_copy_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*vpx_highbd_convolve_copy)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
void vpx_highbd_d117_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
#define vpx_highbd_d117_predictor_16x16 vpx_highbd_d117_predictor_16x16_c
@@ -1022,56 +1031,56 @@ void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint1
void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
#define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_c
-void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct16x16_10_add vpx_highbd_idct16x16_10_add_sse2
-void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_c
-void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct16x16_256_add vpx_highbd_idct16x16_256_add_sse2
-void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct16x16_38_add vpx_highbd_idct16x16_256_add_sse2
-void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_c
-void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct32x32_135_add vpx_highbd_idct32x32_135_add_c
-void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct32x32_1_add vpx_highbd_idct32x32_1_add_sse2
-void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_c
-void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct4x4_16_add vpx_highbd_idct4x4_16_add_sse2
-void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_c
-void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct8x8_12_add vpx_highbd_idct8x8_12_add_sse2
-void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_c
-void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_idct8x8_64_add vpx_highbd_idct8x8_64_add_sse2
-void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c
-void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
#define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c
void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
@@ -2082,6 +2091,22 @@ static void setup_rtcd_internal(void)
if (flags & HAS_AVX2) vpx_get16x16var = vpx_get16x16var_avx2;
vpx_hadamard_8x8 = vpx_hadamard_8x8_sse2;
if (flags & HAS_SSSE3) vpx_hadamard_8x8 = vpx_hadamard_8x8_ssse3;
+ vpx_highbd_convolve8 = vpx_highbd_convolve8_sse2;
+ if (flags & HAS_AVX2) vpx_highbd_convolve8 = vpx_highbd_convolve8_avx2;
+ vpx_highbd_convolve8_avg = vpx_highbd_convolve8_avg_sse2;
+ if (flags & HAS_AVX2) vpx_highbd_convolve8_avg = vpx_highbd_convolve8_avg_avx2;
+ vpx_highbd_convolve8_avg_horiz = vpx_highbd_convolve8_avg_horiz_sse2;
+ if (flags & HAS_AVX2) vpx_highbd_convolve8_avg_horiz = vpx_highbd_convolve8_avg_horiz_avx2;
+ vpx_highbd_convolve8_avg_vert = vpx_highbd_convolve8_avg_vert_sse2;
+ if (flags & HAS_AVX2) vpx_highbd_convolve8_avg_vert = vpx_highbd_convolve8_avg_vert_avx2;
+ vpx_highbd_convolve8_horiz = vpx_highbd_convolve8_horiz_sse2;
+ if (flags & HAS_AVX2) vpx_highbd_convolve8_horiz = vpx_highbd_convolve8_horiz_avx2;
+ vpx_highbd_convolve8_vert = vpx_highbd_convolve8_vert_sse2;
+ if (flags & HAS_AVX2) vpx_highbd_convolve8_vert = vpx_highbd_convolve8_vert_avx2;
+ vpx_highbd_convolve_avg = vpx_highbd_convolve_avg_sse2;
+ if (flags & HAS_AVX2) vpx_highbd_convolve_avg = vpx_highbd_convolve_avg_avx2;
+ vpx_highbd_convolve_copy = vpx_highbd_convolve_copy_sse2;
+ if (flags & HAS_AVX2) vpx_highbd_convolve_copy = vpx_highbd_convolve_copy_avx2;
vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_sse2;
if (flags & HAS_SSSE3) vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_ssse3;
vpx_idct32x32_135_add = vpx_idct32x32_1024_add_sse2;
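
Worth noting against the ia32 header earlier in the patch: on x86-64 the generator binds SSE2 variants with a plain #define, because SSE2 is part of the x86-64 baseline, and only the SSSE3/SSE4.1/AVX2 upgrades go through setup_rtcd_internal(); on 32-bit x86 even SSE2 has to be probed at runtime. A compilable illustration with a hypothetical entry point vpx_foo:

    #include <stdint.h>
    #include <stdio.h>

    /* vpx_foo is an invented example, standing in for the entries above. */
    static void vpx_foo_c(const uint8_t *src)    { (void)src; puts("C"); }
    static void vpx_foo_sse2(const uint8_t *src) { (void)src; puts("SSE2"); }

    #if defined(__x86_64__) || defined(_M_X64)
    /* x86-64: SSE2 is guaranteed, so bind it at compile time. */
    #define vpx_foo vpx_foo_sse2
    #else
    /* 32-bit x86: start at the C baseline, upgraded at runtime
     * (the RTCD_EXTERN pointer in the generated ia32 headers). */
    static void (*vpx_foo)(const uint8_t *src) = vpx_foo_c;
    #endif

    int main(void) {
      uint8_t src[1] = { 0 };
      vpx_foo(src);
      return 0;
    }
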
diff --git a/chromium/third_party/libvpx/source/libvpx/build/make/Makefile b/chromium/third_party/libvpx/source/libvpx/build/make/Makefile
index 0d29609ff8c..90522e5f63a 100644
--- a/chromium/third_party/libvpx/source/libvpx/build/make/Makefile
+++ b/chromium/third_party/libvpx/source/libvpx/build/make/Makefile
@@ -141,8 +141,8 @@ $(BUILD_PFX)%_avx2.c.d: CFLAGS += -mavx2
$(BUILD_PFX)%_avx2.c.o: CFLAGS += -mavx2
# POWER
-$(BUILD_PFX)%_vsx.c.d: CFLAGS += -mvsx
-$(BUILD_PFX)%_vsx.c.o: CFLAGS += -mvsx
+$(BUILD_PFX)%_vsx.c.d: CFLAGS += -maltivec -mvsx
+$(BUILD_PFX)%_vsx.c.o: CFLAGS += -maltivec -mvsx
$(BUILD_PFX)%.c.d: %.c
$(if $(quiet),@echo " [DEP] $@")
diff --git a/chromium/third_party/libvpx/source/libvpx/build/make/configure.sh b/chromium/third_party/libvpx/source/libvpx/build/make/configure.sh
index dcfdfe1d2ba..fbe8b1b4580 100644
--- a/chromium/third_party/libvpx/source/libvpx/build/make/configure.sh
+++ b/chromium/third_party/libvpx/source/libvpx/build/make/configure.sh
@@ -674,7 +674,6 @@ check_xcode_minimum_version() {
process_common_toolchain() {
if [ -z "$toolchain" ]; then
gcctarget="${CHOST:-$(gcc -dumpmachine 2> /dev/null)}"
-
# detect tgt_isa
case "$gcctarget" in
aarch64*)
@@ -697,6 +696,9 @@ process_common_toolchain() {
*sparc*)
tgt_isa=sparc
;;
+ power*64*-*)
+ tgt_isa=ppc64
+ ;;
power*)
tgt_isa=ppc
;;
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_if.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_if.c
index 64d177581ed..b571d29d9a4 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_if.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_if.c
@@ -728,6 +728,7 @@ void vp8_set_speed_features(VP8_COMP *cpi) {
SPEED_FEATURES *sf = &cpi->sf;
int Mode = cpi->compressor_speed;
int Speed = cpi->Speed;
+ int Speed2;
int i;
VP8_COMMON *cm = &cpi->common;
int last_improved_quant = sf->improved_quant;
@@ -829,9 +830,16 @@ void vp8_set_speed_features(VP8_COMP *cpi) {
cpi->mode_check_freq[THR_V_PRED] = cpi->mode_check_freq[THR_H_PRED] =
cpi->mode_check_freq[THR_B_PRED] =
speed_map(Speed, mode_check_freq_map_vhbpred);
- cpi->mode_check_freq[THR_NEW1] = speed_map(Speed, mode_check_freq_map_new1);
+
+ // For real-time mode at speed 10, keep the mode_check_freq threshold
+ // for NEW1 similar to that of speed 9.
+ Speed2 = Speed;
+ if (cpi->Speed == 10 && Mode == 2) Speed2 = RT(9);
+ cpi->mode_check_freq[THR_NEW1] = speed_map(Speed2, mode_check_freq_map_new1);
+
cpi->mode_check_freq[THR_NEW2] = cpi->mode_check_freq[THR_NEW3] =
speed_map(Speed, mode_check_freq_map_new2);
+
cpi->mode_check_freq[THR_SPLIT1] =
speed_map(Speed, mode_check_freq_map_split1);
cpi->mode_check_freq[THR_SPLIT2] = cpi->mode_check_freq[THR_SPLIT3] =
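
For context on the Speed2 hunk above: the mode_check_freq values come from speed_map() lookups over tables of (value, cutoff) pairs on a unified speed scale, where RT(x) places real-time speed x on that scale. Clamping Speed2 to RT(9) therefore makes NEW1 reuse the speed-9 table entry when running at speed 10. A runnable illustration; the table contents are invented and the RT() definition is quoted from memory, so treat both as approximations of the vp8 code:

    #include <limits.h>
    #include <stdio.h>

    #define RT(x) ((x) + 7) /* assumed mapping of real-time speeds */

    /* Lookup in the style of vp8's speed maps: take a value, then keep
     * advancing while `speed` is at or past the next cutoff. */
    static int speed_map(int speed, const int *map) {
      int res;
      do {
        res = *map++;
      } while (speed >= *map++);
      return res;
    }

    int main(void) {
      /* Invented table: check frequency 0 up to RT(10), then 2. */
      static const int freq_map[] = { 0, RT(10), 2, INT_MAX };
      int unclamped = speed_map(RT(10), freq_map);
      int clamped = speed_map(RT(9), freq_map); /* the NEW1 clamp */
      printf("speed 10 raw=%d, with clamp=%d\n", unclamped, clamped);
      return 0;
    }
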
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_idct.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_idct.c
index 55957414cde..69069042cc2 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_idct.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_idct.c
@@ -205,7 +205,7 @@ void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
#if CONFIG_VP9_HIGHBITDEPTH
-void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
+void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint16_t *dest,
int stride, int tx_type, int bd) {
const highbd_transform_2d IHT_4[] = {
{ vpx_highbd_idct4_c, vpx_highbd_idct4_c }, // DCT_DCT = 0
@@ -213,7 +213,6 @@ void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
{ vpx_highbd_idct4_c, vpx_highbd_iadst4_c }, // DCT_ADST = 2
{ vpx_highbd_iadst4_c, vpx_highbd_iadst4_c } // ADST_ADST = 3
};
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
int i, j;
tran_low_t out[4 * 4];
@@ -245,14 +244,13 @@ static const highbd_transform_2d HIGH_IHT_8[] = {
{ vpx_highbd_iadst8_c, vpx_highbd_iadst8_c } // ADST_ADST = 3
};
-void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
+void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest,
int stride, int tx_type, int bd) {
int i, j;
tran_low_t out[8 * 8];
tran_low_t *outptr = out;
tran_low_t temp_in[8], temp_out[8];
const highbd_transform_2d ht = HIGH_IHT_8[tx_type];
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
// Inverse transform row vectors.
for (i = 0; i < 8; ++i) {
@@ -279,14 +277,13 @@ static const highbd_transform_2d HIGH_IHT_16[] = {
{ vpx_highbd_iadst16_c, vpx_highbd_iadst16_c } // ADST_ADST = 3
};
-void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
+void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint16_t *dest,
int stride, int tx_type, int bd) {
int i, j;
tran_low_t out[16 * 16];
tran_low_t *outptr = out;
tran_low_t temp_in[16], temp_out[16];
const highbd_transform_2d ht = HIGH_IHT_16[tx_type];
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
// Rows
for (i = 0; i < 16; ++i) {
@@ -307,7 +304,7 @@ void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
}
// idct
-void vp9_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+void vp9_highbd_idct4x4_add(const tran_low_t *input, uint16_t *dest, int stride,
int eob, int bd) {
if (eob > 1)
vpx_highbd_idct4x4_16_add(input, dest, stride, bd);
@@ -315,7 +312,7 @@ void vp9_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
vpx_highbd_idct4x4_1_add(input, dest, stride, bd);
}
-void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint16_t *dest, int stride,
int eob, int bd) {
if (eob > 1)
vpx_highbd_iwht4x4_16_add(input, dest, stride, bd);
@@ -323,7 +320,7 @@ void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
vpx_highbd_iwht4x4_1_add(input, dest, stride, bd);
}
-void vp9_highbd_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
+void vp9_highbd_idct8x8_add(const tran_low_t *input, uint16_t *dest, int stride,
int eob, int bd) {
// If dc is 1, then input[0] is the reconstructed value, do not need
// dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1.
@@ -340,7 +337,7 @@ void vp9_highbd_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
}
}
-void vp9_highbd_idct16x16_add(const tran_low_t *input, uint8_t *dest,
+void vp9_highbd_idct16x16_add(const tran_low_t *input, uint16_t *dest,
int stride, int eob, int bd) {
// The calculation can be simplified if there are not many non-zero dct
// coefficients. Use eobs to separate different cases.
@@ -356,7 +353,7 @@ void vp9_highbd_idct16x16_add(const tran_low_t *input, uint8_t *dest,
}
}
-void vp9_highbd_idct32x32_add(const tran_low_t *input, uint8_t *dest,
+void vp9_highbd_idct32x32_add(const tran_low_t *input, uint16_t *dest,
int stride, int eob, int bd) {
// Non-zero coeff only in upper-left 8x8
if (eob == 1) {
@@ -372,7 +369,7 @@ void vp9_highbd_idct32x32_add(const tran_low_t *input, uint8_t *dest,
// iht
void vp9_highbd_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input,
- uint8_t *dest, int stride, int eob, int bd) {
+ uint16_t *dest, int stride, int eob, int bd) {
if (tx_type == DCT_DCT)
vp9_highbd_idct4x4_add(input, dest, stride, eob, bd);
else
@@ -380,7 +377,7 @@ void vp9_highbd_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input,
}
void vp9_highbd_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input,
- uint8_t *dest, int stride, int eob, int bd) {
+ uint16_t *dest, int stride, int eob, int bd) {
if (tx_type == DCT_DCT) {
vp9_highbd_idct8x8_add(input, dest, stride, eob, bd);
} else {
@@ -389,7 +386,7 @@ void vp9_highbd_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input,
}
void vp9_highbd_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input,
- uint8_t *dest, int stride, int eob, int bd) {
+ uint16_t *dest, int stride, int eob, int bd) {
if (tx_type == DCT_DCT) {
vp9_highbd_idct16x16_add(input, dest, stride, eob, bd);
} else {
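
The pattern removed throughout vp9_idct.c: each high-bitdepth function used to take uint8_t *dest8 and immediately recover the real buffer with CONVERT_TO_SHORTPTR(dest8). With uint16_t in the signatures, that conversion moves out to the callers (see the vp9_reconinter.c hunk below). The macros encode a 16-bit buffer by halving its address, so byte-style stride arithmetic stays consistent. Approximately as defined in vpx_dsp/vpx_dsp_common.h, and relying on even addresses, so take this as a sketch:

    #include <assert.h>
    #include <stdint.h>

    /* Approximate definitions, quoted from memory of vpx_dsp_common.h. */
    #define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))
    #define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))

    int main(void) {
      uint16_t frame[64];
      /* The halved address means an offset of n "bytes" on the disguised
       * pointer lands on element n of the real 16-bit buffer. */
      uint8_t *dest8 = CONVERT_TO_BYTEPTR(frame);
      uint16_t *row1 = CONVERT_TO_SHORTPTR(dest8 + 8);
      assert(row1 == frame + 8);
      return 0;
    }
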
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_idct.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_idct.h
index ea958a38c0e..3e83b8402de 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_idct.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_idct.h
@@ -57,22 +57,22 @@ void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
int stride, int eob);
#if CONFIG_VP9_HIGHBITDEPTH
-void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint16_t *dest, int stride,
int eob, int bd);
-void vp9_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+void vp9_highbd_idct4x4_add(const tran_low_t *input, uint16_t *dest, int stride,
int eob, int bd);
-void vp9_highbd_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
+void vp9_highbd_idct8x8_add(const tran_low_t *input, uint16_t *dest, int stride,
int eob, int bd);
-void vp9_highbd_idct16x16_add(const tran_low_t *input, uint8_t *dest,
+void vp9_highbd_idct16x16_add(const tran_low_t *input, uint16_t *dest,
int stride, int eob, int bd);
-void vp9_highbd_idct32x32_add(const tran_low_t *input, uint8_t *dest,
+void vp9_highbd_idct32x32_add(const tran_low_t *input, uint16_t *dest,
int stride, int eob, int bd);
void vp9_highbd_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input,
- uint8_t *dest, int stride, int eob, int bd);
+ uint16_t *dest, int stride, int eob, int bd);
void vp9_highbd_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input,
- uint8_t *dest, int stride, int eob, int bd);
+ uint16_t *dest, int stride, int eob, int bd);
void vp9_highbd_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input,
- uint8_t *dest, int stride, int eob, int bd);
+ uint16_t *dest, int stride, int eob, int bd);
#endif // CONFIG_VP9_HIGHBITDEPTH
#ifdef __cplusplus
} // extern "C"
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconinter.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconinter.c
index 8eb71268986..a108a65153b 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconinter.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconinter.c
@@ -21,7 +21,7 @@
#if CONFIG_VP9_HIGHBITDEPTH
void vp9_highbd_build_inter_predictor(
- const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride,
+ const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
const MV *src_mv, const struct scale_factors *sf, int w, int h, int ref,
const InterpKernel *kernel, enum mv_precision precision, int x, int y,
int bd) {
@@ -190,7 +190,8 @@ static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- highbd_inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride,
+ highbd_inter_predictor(CONVERT_TO_SHORTPTR(pre), pre_buf->stride,
+ CONVERT_TO_SHORTPTR(dst), dst_buf->stride,
subpel_x, subpel_y, sf, w, h, ref, kernel, xs, ys,
xd->bd);
} else {
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconinter.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconinter.h
index 4fed4f7f6ec..1b09b380d41 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconinter.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconinter.h
@@ -33,7 +33,7 @@ static INLINE void inter_predictor(const uint8_t *src, int src_stride,
#if CONFIG_VP9_HIGHBITDEPTH
static INLINE void highbd_inter_predictor(
- const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride,
+ const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
const int subpel_x, const int subpel_y, const struct scale_factors *sf,
int w, int h, int ref, const InterpKernel *kernel, int xs, int ys, int bd) {
sf->highbd_predict[subpel_x != 0][subpel_y != 0][ref](
@@ -68,7 +68,7 @@ void vp9_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
#if CONFIG_VP9_HIGHBITDEPTH
void vp9_highbd_build_inter_predictor(
- const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride,
+ const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
const MV *mv_q3, const struct scale_factors *sf, int w, int h, int do_avg,
const InterpKernel *kernel, enum mv_precision precision, int x, int y,
int bd);
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_rtcd_defs.pl b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_rtcd_defs.pl
index 10c779c01d3..baf63e97fa9 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_rtcd_defs.pl
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_rtcd_defs.pl
@@ -7,6 +7,7 @@ print <<EOF
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_enums.h"
+#include "vp9/common/vp9_filter.h"
struct macroblockd;
@@ -101,11 +102,11 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
#
# Note as optimized versions of these functions are added we need to add a check to ensure
# that when CONFIG_EMULATE_HARDWARE is on, it defaults to the C versions only.
- add_proto qw/void vp9_highbd_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd";
+ add_proto qw/void vp9_highbd_iht4x4_16_add/, "const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd";
- add_proto qw/void vp9_highbd_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd";
+ add_proto qw/void vp9_highbd_iht8x8_64_add/, "const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd";
- add_proto qw/void vp9_highbd_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd";
+ add_proto qw/void vp9_highbd_iht16x16_256_add/, "const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd";
}
#
@@ -120,7 +121,7 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
#
if (vpx_config("CONFIG_VP9_TEMPORAL_DENOISING") eq "yes") {
add_proto qw/int vp9_denoiser_filter/, "const uint8_t *sig, int sig_stride, const uint8_t *mc_avg, int mc_avg_stride, uint8_t *avg, int avg_stride, int increase_denoising, BLOCK_SIZE bs, int motion_magnitude";
- specialize qw/vp9_denoiser_filter sse2/;
+ specialize qw/vp9_denoiser_filter neon sse2/;
}
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
@@ -197,8 +198,8 @@ $vp9_full_search_sad_sse4_1=vp9_full_search_sadx8;
add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv";
specialize qw/vp9_diamond_search_sad avx/;
-add_proto qw/void vp9_temporal_filter_apply/, "const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
-specialize qw/vp9_temporal_filter_apply sse2 msa/;
+add_proto qw/void vp9_temporal_filter_apply/, "const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count";
+specialize qw/vp9_temporal_filter_apply sse4_1/;
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
@@ -217,7 +218,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vp9_highbd_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
- add_proto qw/void vp9_highbd_temporal_filter_apply/, "const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
+ add_proto qw/void vp9_highbd_temporal_filter_apply/, "const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count";
}
# End vp9_high encoder functions
@@ -225,7 +226,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
#
# frame based scale
#
-add_proto qw/void vp9_scale_and_extend_frame/, "const struct yv12_buffer_config *src, struct yv12_buffer_config *dst";
+add_proto qw/void vp9_scale_and_extend_frame/, "const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler";
specialize qw/vp9_scale_and_extend_frame ssse3/;
}
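
The add_proto/specialize lines above feed the rtcd generator, which emits the per-platform vp9_rtcd.h headers touched elsewhere in this change. An illustrative C sketch of the shape of the generated dispatch for the retuned temporal filter (the flag value and setup function are placeholders, not the literal generated code):

    #include <stdint.h>

    #define HAS_SSE4_1 0x20  /* illustrative CPU-capability flag */

    void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride,
                                     const uint8_t *frame2,
                                     unsigned int block_width,
                                     unsigned int block_height, int strength,
                                     int filter_weight, uint32_t *accumulator,
                                     uint16_t *count);
    void vp9_temporal_filter_apply_sse4_1(const uint8_t *frame1,
                                          unsigned int stride,
                                          const uint8_t *frame2,
                                          unsigned int block_width,
                                          unsigned int block_height,
                                          int strength, int filter_weight,
                                          uint32_t *accumulator,
                                          uint16_t *count);

    /* Dispatch pointer, rebound once at vp9_rtcd() time. */
    void (*vp9_temporal_filter_apply)(const uint8_t *, unsigned int,
                                      const uint8_t *, unsigned int,
                                      unsigned int, int, int, uint32_t *,
                                      uint16_t *);

    static void setup_rtcd_sketch(int flags) {
      vp9_temporal_filter_apply = vp9_temporal_filter_apply_c;
      if (flags & HAS_SSE4_1)
        vp9_temporal_filter_apply = vp9_temporal_filter_apply_sse4_1;
    }

Dropping msa from the specialize line is consistent with the deletion of vp9_temporal_filter_msa.c further down in this change.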
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c
index dcfc454aa0d..bb2dcf52bf5 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -16,7 +16,6 @@
void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
int tx_type) {
__m128i in[2];
- const __m128i zero = _mm_setzero_si128();
const __m128i eight = _mm_set1_epi16(8);
in[0] = load_input_data(input);
@@ -49,31 +48,7 @@ void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
in[0] = _mm_srai_epi16(in[0], 4);
in[1] = _mm_srai_epi16(in[1], 4);
- // Reconstruction and Store
- {
- __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
- __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
- d0 = _mm_unpacklo_epi32(d0,
- _mm_cvtsi32_si128(*(const int *)(dest + stride)));
- d2 = _mm_unpacklo_epi32(
- d2, _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)));
- d0 = _mm_unpacklo_epi8(d0, zero);
- d2 = _mm_unpacklo_epi8(d2, zero);
- d0 = _mm_add_epi16(d0, in[0]);
- d2 = _mm_add_epi16(d2, in[1]);
- d0 = _mm_packus_epi16(d0, d2);
- // store result[0]
- *(int *)dest = _mm_cvtsi128_si32(d0);
- // store result[1]
- d0 = _mm_srli_si128(d0, 4);
- *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
- // store result[2]
- d0 = _mm_srli_si128(d0, 4);
- *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
- // store result[3]
- d0 = _mm_srli_si128(d0, 4);
- *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
- }
+ recon_and_store4x4_sse2(in, dest, stride);
}
void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
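
The block removed above is the generic 4x4 reconstruct-and-store tail, now factored into the shared recon_and_store4x4_sse2 helper. A sketch of what that helper presumably does, reconstructed directly from the deleted lines:

    #include <emmintrin.h>
    #include <stdint.h>

    /* Reconstructed from the deleted block: add 16-bit residual rows in
     * in[0]/in[1] to four 4-pixel destination rows, clamp, and store. */
    static void recon_and_store4x4_sketch(const __m128i *in, uint8_t *dest,
                                          int stride) {
      const __m128i zero = _mm_setzero_si128();
      __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
      __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
      d0 = _mm_unpacklo_epi32(
          d0, _mm_cvtsi32_si128(*(const int *)(dest + stride)));
      d2 = _mm_unpacklo_epi32(
          d2, _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)));
      d0 = _mm_unpacklo_epi8(d0, zero); /* widen bytes to 16 bits */
      d2 = _mm_unpacklo_epi8(d2, zero);
      d0 = _mm_add_epi16(d0, in[0]);    /* add residual */
      d2 = _mm_add_epi16(d2, in[1]);
      d0 = _mm_packus_epi16(d0, d2);    /* saturate back to 8 bits */
      *(int *)dest = _mm_cvtsi128_si32(d0);
      d0 = _mm_srli_si128(d0, 4);
      *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
      d0 = _mm_srli_si128(d0, 4);
      *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
      d0 = _mm_srli_si128(d0, 4);
      *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
    }

This also explains why the local zero constant at the top of the function could be deleted.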
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodeframe.c b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodeframe.c
index f71f7d1eb41..0760f8c2398 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodeframe.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodeframe.c
@@ -189,21 +189,22 @@ static void inverse_transform_block_inter(MACROBLOCKD *xd, int plane,
assert(eob > 0);
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst);
if (xd->lossless) {
- vp9_highbd_iwht4x4_add(dqcoeff, dst, stride, eob, xd->bd);
+ vp9_highbd_iwht4x4_add(dqcoeff, dst16, stride, eob, xd->bd);
} else {
switch (tx_size) {
case TX_4X4:
- vp9_highbd_idct4x4_add(dqcoeff, dst, stride, eob, xd->bd);
+ vp9_highbd_idct4x4_add(dqcoeff, dst16, stride, eob, xd->bd);
break;
case TX_8X8:
- vp9_highbd_idct8x8_add(dqcoeff, dst, stride, eob, xd->bd);
+ vp9_highbd_idct8x8_add(dqcoeff, dst16, stride, eob, xd->bd);
break;
case TX_16X16:
- vp9_highbd_idct16x16_add(dqcoeff, dst, stride, eob, xd->bd);
+ vp9_highbd_idct16x16_add(dqcoeff, dst16, stride, eob, xd->bd);
break;
case TX_32X32:
- vp9_highbd_idct32x32_add(dqcoeff, dst, stride, eob, xd->bd);
+ vp9_highbd_idct32x32_add(dqcoeff, dst16, stride, eob, xd->bd);
break;
default: assert(0 && "Invalid transform size");
}
@@ -256,21 +257,22 @@ static void inverse_transform_block_intra(MACROBLOCKD *xd, int plane,
assert(eob > 0);
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst);
if (xd->lossless) {
- vp9_highbd_iwht4x4_add(dqcoeff, dst, stride, eob, xd->bd);
+ vp9_highbd_iwht4x4_add(dqcoeff, dst16, stride, eob, xd->bd);
} else {
switch (tx_size) {
case TX_4X4:
- vp9_highbd_iht4x4_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
+ vp9_highbd_iht4x4_add(tx_type, dqcoeff, dst16, stride, eob, xd->bd);
break;
case TX_8X8:
- vp9_highbd_iht8x8_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
+ vp9_highbd_iht8x8_add(tx_type, dqcoeff, dst16, stride, eob, xd->bd);
break;
case TX_16X16:
- vp9_highbd_iht16x16_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
+ vp9_highbd_iht16x16_add(tx_type, dqcoeff, dst16, stride, eob, xd->bd);
break;
case TX_32X32:
- vp9_highbd_idct32x32_add(dqcoeff, dst, stride, eob, xd->bd);
+ vp9_highbd_idct32x32_add(dqcoeff, dst16, stride, eob, xd->bd);
break;
default: assert(0 && "Invalid transform size");
}
@@ -451,24 +453,19 @@ static void extend_and_predict(const uint8_t *buf_ptr1, int pre_buf_stride,
const struct scale_factors *sf, MACROBLOCKD *xd,
int w, int h, int ref, int xs, int ys) {
DECLARE_ALIGNED(16, uint16_t, mc_buf_high[80 * 2 * 80 * 2]);
- const uint8_t *buf_ptr;
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
high_build_mc_border(buf_ptr1, pre_buf_stride, mc_buf_high, b_w, x0, y0,
b_w, b_h, frame_width, frame_height);
- buf_ptr = CONVERT_TO_BYTEPTR(mc_buf_high) + border_offset;
+ highbd_inter_predictor(mc_buf_high + border_offset, b_w,
+ CONVERT_TO_SHORTPTR(dst), dst_buf_stride, subpel_x,
+ subpel_y, sf, w, h, ref, kernel, xs, ys, xd->bd);
} else {
build_mc_border(buf_ptr1, pre_buf_stride, (uint8_t *)mc_buf_high, b_w, x0,
y0, b_w, b_h, frame_width, frame_height);
- buf_ptr = ((uint8_t *)mc_buf_high) + border_offset;
- }
-
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- highbd_inter_predictor(buf_ptr, b_w, dst, dst_buf_stride, subpel_x,
- subpel_y, sf, w, h, ref, kernel, xs, ys, xd->bd);
- } else {
- inter_predictor(buf_ptr, b_w, dst, dst_buf_stride, subpel_x, subpel_y, sf,
- w, h, ref, kernel, xs, ys);
+ inter_predictor(((uint8_t *)mc_buf_high) + border_offset, b_w, dst,
+ dst_buf_stride, subpel_x, subpel_y, sf, w, h, ref, kernel,
+ xs, ys);
}
}
#else
@@ -631,7 +628,8 @@ static void dec_build_inter_predictors(
}
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- highbd_inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
+ highbd_inter_predictor(CONVERT_TO_SHORTPTR(buf_ptr), buf_stride,
+ CONVERT_TO_SHORTPTR(dst), dst_buf->stride, subpel_x,
subpel_y, sf, w, h, ref, kernel, xs, ys, xd->bd);
} else {
inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_denoiser_neon.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_denoiser_neon.c
new file mode 100644
index 00000000000..4152e7bb5d5
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_denoiser_neon.c
@@ -0,0 +1,352 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+
+#include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_reconinter.h"
+#include "vp9/encoder/vp9_context_tree.h"
+#include "vp9/encoder/vp9_denoiser.h"
+#include "vpx_mem/vpx_mem.h"
+
+// Compute the sum of all pixel differences of this MB.
+static INLINE int horizontal_add_s8x16(const int8x16_t v_sum_diff_total) {
+ const int16x8_t fe_dc_ba_98_76_54_32_10 = vpaddlq_s8(v_sum_diff_total);
+ const int32x4_t fedc_ba98_7654_3210 = vpaddlq_s16(fe_dc_ba_98_76_54_32_10);
+ const int64x2_t fedcba98_76543210 = vpaddlq_s32(fedc_ba98_7654_3210);
+ const int64x1_t x = vqadd_s64(vget_high_s64(fedcba98_76543210),
+ vget_low_s64(fedcba98_76543210));
+ const int sum_diff = vget_lane_s32(vreinterpret_s32_s64(x), 0);
+ return sum_diff;
+}
+
+// Denoise a 16x1 vector.
+static INLINE int8x16_t denoiser_16x1_neon(
+ const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y,
+ const uint8x16_t v_level1_threshold, const uint8x16_t v_level2_threshold,
+ const uint8x16_t v_level3_threshold, const uint8x16_t v_level1_adjustment,
+ const uint8x16_t v_delta_level_1_and_2,
+ const uint8x16_t v_delta_level_2_and_3, int8x16_t v_sum_diff_total) {
+ const uint8x16_t v_sig = vld1q_u8(sig);
+ const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y);
+
+ /* Calculate absolute difference and sign masks. */
+ const uint8x16_t v_abs_diff = vabdq_u8(v_sig, v_mc_running_avg_y);
+ const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig, v_mc_running_avg_y);
+ const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig, v_mc_running_avg_y);
+
+  /* Figure out which level we are in. */
+ const uint8x16_t v_level1_mask = vcleq_u8(v_level1_threshold, v_abs_diff);
+ const uint8x16_t v_level2_mask = vcleq_u8(v_level2_threshold, v_abs_diff);
+ const uint8x16_t v_level3_mask = vcleq_u8(v_level3_threshold, v_abs_diff);
+
+ /* Calculate absolute adjustments for level 1, 2 and 3. */
+ const uint8x16_t v_level2_adjustment =
+ vandq_u8(v_level2_mask, v_delta_level_1_and_2);
+ const uint8x16_t v_level3_adjustment =
+ vandq_u8(v_level3_mask, v_delta_level_2_and_3);
+ const uint8x16_t v_level1and2_adjustment =
+ vaddq_u8(v_level1_adjustment, v_level2_adjustment);
+ const uint8x16_t v_level1and2and3_adjustment =
+ vaddq_u8(v_level1and2_adjustment, v_level3_adjustment);
+
+  /* Select the absolute adjustment: the absolute difference itself while
+   * still in level 0, otherwise the combined value for levels 1, 2 and 3.
+   */
+ const uint8x16_t v_abs_adjustment =
+ vbslq_u8(v_level1_mask, v_level1and2and3_adjustment, v_abs_diff);
+
+ /* Calculate positive and negative adjustments. Apply them to the signal
+ * and accumulate them. Adjustments are less than eight and the maximum
+ * sum of them (7 * 16) can fit in a signed char.
+ */
+ const uint8x16_t v_pos_adjustment =
+ vandq_u8(v_diff_pos_mask, v_abs_adjustment);
+ const uint8x16_t v_neg_adjustment =
+ vandq_u8(v_diff_neg_mask, v_abs_adjustment);
+
+ uint8x16_t v_running_avg_y = vqaddq_u8(v_sig, v_pos_adjustment);
+ v_running_avg_y = vqsubq_u8(v_running_avg_y, v_neg_adjustment);
+
+ /* Store results. */
+ vst1q_u8(running_avg_y, v_running_avg_y);
+
+  /* Sum all the accumulators to obtain the sum of all pixel differences
+   * for this macroblock.
+   */
+ {
+ const int8x16_t v_sum_diff =
+ vqsubq_s8(vreinterpretq_s8_u8(v_pos_adjustment),
+ vreinterpretq_s8_u8(v_neg_adjustment));
+ v_sum_diff_total = vaddq_s8(v_sum_diff_total, v_sum_diff);
+ }
+ return v_sum_diff_total;
+}
+
+static INLINE int8x16_t denoiser_adjust_16x1_neon(
+ const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y,
+ const uint8x16_t k_delta, int8x16_t v_sum_diff_total) {
+ uint8x16_t v_running_avg_y = vld1q_u8(running_avg_y);
+ const uint8x16_t v_sig = vld1q_u8(sig);
+ const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y);
+
+ /* Calculate absolute difference and sign masks. */
+ const uint8x16_t v_abs_diff = vabdq_u8(v_sig, v_mc_running_avg_y);
+ const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig, v_mc_running_avg_y);
+ const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig, v_mc_running_avg_y);
+ // Clamp absolute difference to delta to get the adjustment.
+ const uint8x16_t v_abs_adjustment = vminq_u8(v_abs_diff, (k_delta));
+
+ const uint8x16_t v_pos_adjustment =
+ vandq_u8(v_diff_pos_mask, v_abs_adjustment);
+ const uint8x16_t v_neg_adjustment =
+ vandq_u8(v_diff_neg_mask, v_abs_adjustment);
+
+ v_running_avg_y = vqsubq_u8(v_running_avg_y, v_pos_adjustment);
+ v_running_avg_y = vqaddq_u8(v_running_avg_y, v_neg_adjustment);
+
+ /* Store results. */
+ vst1q_u8(running_avg_y, v_running_avg_y);
+
+ {
+ const int8x16_t v_sum_diff =
+ vqsubq_s8(vreinterpretq_s8_u8(v_neg_adjustment),
+ vreinterpretq_s8_u8(v_pos_adjustment));
+ v_sum_diff_total = vaddq_s8(v_sum_diff_total, v_sum_diff);
+ }
+ return v_sum_diff_total;
+}
+
+// Denoise 8x8 and 8x16 blocks.
+static int vp9_denoiser_8xN_neon(const uint8_t *sig, int sig_stride,
+ const uint8_t *mc_running_avg_y,
+ int mc_avg_y_stride, uint8_t *running_avg_y,
+ int avg_y_stride, int increase_denoising,
+ BLOCK_SIZE bs, int motion_magnitude,
+ int width) {
+ int sum_diff_thresh, r, sum_diff = 0;
+ const int shift_inc =
+ (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD)
+ ? 1
+ : 0;
+ uint8_t sig_buffer[8][16], mc_running_buffer[8][16], running_buffer[8][16];
+
+ const uint8x16_t v_level1_adjustment = vmovq_n_u8(
+ (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 4 + shift_inc : 3);
+ const uint8x16_t v_delta_level_1_and_2 = vdupq_n_u8(1);
+ const uint8x16_t v_delta_level_2_and_3 = vdupq_n_u8(2);
+ const uint8x16_t v_level1_threshold = vdupq_n_u8(4 + shift_inc);
+ const uint8x16_t v_level2_threshold = vdupq_n_u8(8);
+ const uint8x16_t v_level3_threshold = vdupq_n_u8(16);
+
+ const int b_height = (4 << b_height_log2_lookup[bs]) >> 1;
+
+ int8x16_t v_sum_diff_total = vdupq_n_s8(0);
+
+ for (r = 0; r < b_height; ++r) {
+ memcpy(sig_buffer[r], sig, width);
+ memcpy(sig_buffer[r] + width, sig + sig_stride, width);
+ memcpy(mc_running_buffer[r], mc_running_avg_y, width);
+ memcpy(mc_running_buffer[r] + width, mc_running_avg_y + mc_avg_y_stride,
+ width);
+ memcpy(running_buffer[r], running_avg_y, width);
+ memcpy(running_buffer[r] + width, running_avg_y + avg_y_stride, width);
+ v_sum_diff_total = denoiser_16x1_neon(
+ sig_buffer[r], mc_running_buffer[r], running_buffer[r],
+ v_level1_threshold, v_level2_threshold, v_level3_threshold,
+ v_level1_adjustment, v_delta_level_1_and_2, v_delta_level_2_and_3,
+ v_sum_diff_total);
+ {
+ const uint8x16_t v_running_buffer = vld1q_u8(running_buffer[r]);
+ const uint8x8_t v_running_buffer_high = vget_high_u8(v_running_buffer);
+ const uint8x8_t v_running_buffer_low = vget_low_u8(v_running_buffer);
+ vst1_u8(running_avg_y, v_running_buffer_low);
+ vst1_u8(running_avg_y + avg_y_stride, v_running_buffer_high);
+ }
+ // Update pointers for next iteration.
+ sig += (sig_stride << 1);
+ mc_running_avg_y += (mc_avg_y_stride << 1);
+ running_avg_y += (avg_y_stride << 1);
+ }
+
+ {
+ sum_diff = horizontal_add_s8x16(v_sum_diff_total);
+ sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising);
+ if (abs(sum_diff) > sum_diff_thresh) {
+      // Before falling back to copying the block (i.e., applying no
+      // denoising), check whether we can still apply some (weaker) temporal
+      // filtering that this block would otherwise miss entirely. The
+      // simplest option is to apply an additional adjustment to
+      // running_avg_y to bring it closer to sig. The adjustment is capped
+      // by a maximum delta and chosen such that in most cases the resulting
+      // sum_diff falls within the acceptable range given by sum_diff_thresh.
+
+      // The delta is set by the excess of the absolute pixel diff over the
+      // threshold.
+ const int delta =
+ ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1;
+ // Only apply the adjustment for max delta up to 3.
+ if (delta < 4) {
+ const uint8x16_t k_delta = vmovq_n_u8(delta);
+ running_avg_y -= avg_y_stride * (b_height << 1);
+ for (r = 0; r < b_height; ++r) {
+ v_sum_diff_total = denoiser_adjust_16x1_neon(
+ sig_buffer[r], mc_running_buffer[r], running_buffer[r], k_delta,
+ v_sum_diff_total);
+ {
+ const uint8x16_t v_running_buffer = vld1q_u8(running_buffer[r]);
+ const uint8x8_t v_running_buffer_high =
+ vget_high_u8(v_running_buffer);
+ const uint8x8_t v_running_buffer_low =
+ vget_low_u8(v_running_buffer);
+ vst1_u8(running_avg_y, v_running_buffer_low);
+ vst1_u8(running_avg_y + avg_y_stride, v_running_buffer_high);
+ }
+ // Update pointers for next iteration.
+ running_avg_y += (avg_y_stride << 1);
+ }
+ sum_diff = horizontal_add_s8x16(v_sum_diff_total);
+ if (abs(sum_diff) > sum_diff_thresh) {
+ return COPY_BLOCK;
+ }
+ } else {
+ return COPY_BLOCK;
+ }
+ }
+ }
+
+ return FILTER_BLOCK;
+}
+
+// Denoise 16x8, 16x16, 16x32, 32x16, 32x32, 32x64, 64x32 and 64x64 blocks.
+static int vp9_denoiser_NxM_neon(const uint8_t *sig, int sig_stride,
+ const uint8_t *mc_running_avg_y,
+ int mc_avg_y_stride, uint8_t *running_avg_y,
+ int avg_y_stride, int increase_denoising,
+ BLOCK_SIZE bs, int motion_magnitude) {
+ const int shift_inc =
+ (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD)
+ ? 1
+ : 0;
+ const uint8x16_t v_level1_adjustment = vmovq_n_u8(
+ (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 4 + shift_inc : 3);
+ const uint8x16_t v_delta_level_1_and_2 = vdupq_n_u8(1);
+ const uint8x16_t v_delta_level_2_and_3 = vdupq_n_u8(2);
+ const uint8x16_t v_level1_threshold = vmovq_n_u8(4 + shift_inc);
+ const uint8x16_t v_level2_threshold = vdupq_n_u8(8);
+ const uint8x16_t v_level3_threshold = vdupq_n_u8(16);
+
+ const int b_width = (4 << b_width_log2_lookup[bs]);
+ const int b_height = (4 << b_height_log2_lookup[bs]);
+ const int b_width_shift4 = b_width >> 4;
+
+ int8x16_t v_sum_diff_total[4][4];
+ int r, c, sum_diff = 0;
+
+ for (r = 0; r < 4; ++r) {
+ for (c = 0; c < b_width_shift4; ++c) {
+ v_sum_diff_total[c][r] = vdupq_n_s8(0);
+ }
+ }
+
+ for (r = 0; r < b_height; ++r) {
+ for (c = 0; c < b_width_shift4; ++c) {
+ v_sum_diff_total[c][r >> 4] = denoiser_16x1_neon(
+ sig, mc_running_avg_y, running_avg_y, v_level1_threshold,
+ v_level2_threshold, v_level3_threshold, v_level1_adjustment,
+ v_delta_level_1_and_2, v_delta_level_2_and_3,
+ v_sum_diff_total[c][r >> 4]);
+
+ // Update pointers for next iteration.
+ sig += 16;
+ mc_running_avg_y += 16;
+ running_avg_y += 16;
+ }
+
+ if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) {
+ for (c = 0; c < b_width_shift4; ++c) {
+ sum_diff += horizontal_add_s8x16(v_sum_diff_total[c][r >> 4]);
+ }
+ }
+
+ // Update pointers for next iteration.
+ sig = sig - b_width + sig_stride;
+ mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride;
+ running_avg_y = running_avg_y - b_width + avg_y_stride;
+ }
+
+ {
+ const int sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising);
+ if (abs(sum_diff) > sum_diff_thresh) {
+ const int delta =
+ ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1;
+ // Only apply the adjustment for max delta up to 3.
+ if (delta < 4) {
+ const uint8x16_t k_delta = vdupq_n_u8(delta);
+ sig -= sig_stride * b_height;
+ mc_running_avg_y -= mc_avg_y_stride * b_height;
+ running_avg_y -= avg_y_stride * b_height;
+ sum_diff = 0;
+
+ for (r = 0; r < b_height; ++r) {
+ for (c = 0; c < b_width_shift4; ++c) {
+ v_sum_diff_total[c][r >> 4] =
+ denoiser_adjust_16x1_neon(sig, mc_running_avg_y, running_avg_y,
+ k_delta, v_sum_diff_total[c][r >> 4]);
+
+ // Update pointers for next iteration.
+ sig += 16;
+ mc_running_avg_y += 16;
+ running_avg_y += 16;
+ }
+ if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) {
+ for (c = 0; c < b_width_shift4; ++c) {
+ sum_diff += horizontal_add_s8x16(v_sum_diff_total[c][r >> 4]);
+ }
+ }
+
+ sig = sig - b_width + sig_stride;
+ mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride;
+ running_avg_y = running_avg_y - b_width + avg_y_stride;
+ }
+
+ if (abs(sum_diff) > sum_diff_thresh) {
+ return COPY_BLOCK;
+ }
+ } else {
+ return COPY_BLOCK;
+ }
+ }
+ }
+ return FILTER_BLOCK;
+}
+
+int vp9_denoiser_filter_neon(const uint8_t *sig, int sig_stride,
+ const uint8_t *mc_avg, int mc_avg_stride,
+ uint8_t *avg, int avg_stride,
+ int increase_denoising, BLOCK_SIZE bs,
+ int motion_magnitude) {
+  // Order the checks by block-type frequency so the common sizes exit early.
+ if (bs == BLOCK_16X16 || bs == BLOCK_32X32 || bs == BLOCK_64X64 ||
+ bs == BLOCK_16X32 || bs == BLOCK_16X8 || bs == BLOCK_32X16 ||
+ bs == BLOCK_32X64 || bs == BLOCK_64X32) {
+ return vp9_denoiser_NxM_neon(sig, sig_stride, mc_avg, mc_avg_stride, avg,
+ avg_stride, increase_denoising, bs,
+ motion_magnitude);
+ } else if (bs == BLOCK_8X8 || bs == BLOCK_8X16) {
+ return vp9_denoiser_8xN_neon(sig, sig_stride, mc_avg, mc_avg_stride, avg,
+ avg_stride, increase_denoising, bs,
+ motion_magnitude, 8);
+ }
+ return COPY_BLOCK;
+}
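
As a reading aid for the NEON file above, here is a scalar sketch of the per-pixel rule that denoiser_16x1_neon vectorizes. The thresholds and step sizes are taken from the code; the helper itself is illustrative, and the saturating clamp to [0, 255] is omitted for brevity:

    #include <stdint.h>
    #include <stdlib.h>

    /* Scalar model of the level logic in denoiser_16x1_neon: small
     * differences are copied through, larger ones pull the pixel toward
     * the motion-compensated running average by a level-capped step. */
    static int denoise_pixel_sketch(uint8_t sig, uint8_t mc_avg,
                                    int shift_inc, int motion_is_low) {
      const int diff = mc_avg - sig;
      const int abs_diff = abs(diff);
      int adj;
      if (abs_diff < 4 + shift_inc) {
        adj = abs_diff;                          /* level 0: take the full diff */
      } else {
        adj = motion_is_low ? 4 + shift_inc : 3; /* level 1 adjustment */
        if (abs_diff >= 8) adj += 1;             /* level 2: delta(1->2) */
        if (abs_diff >= 16) adj += 2;            /* level 3: delta(2->3) */
      }
      return diff > 0 ? sig + adj : sig - adj;   /* move toward mc_avg */
    }

The signed per-lane sums of these adjustments are what horizontal_add_s8x16 collapses into sum_diff for the copy-versus-filter decision.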
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c
index 98c56407596..0b175969be6 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c
@@ -22,6 +22,7 @@
#include "vp9/encoder/vp9_rd.h"
#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/vpx_dsp_common.h"
void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count,
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/mips/msa/vp9_temporal_filter_msa.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/mips/msa/vp9_temporal_filter_msa.c
deleted file mode 100644
index 1ab5f36cc59..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/mips/msa/vp9_temporal_filter_msa.c
+++ /dev/null
@@ -1,285 +0,0 @@
-/*
- * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vp9_rtcd.h"
-#include "vpx_dsp/mips/macros_msa.h"
-
-static void temporal_filter_apply_8size_msa(const uint8_t *frm1_ptr,
- uint32_t stride,
- const uint8_t *frm2_ptr,
- int32_t filt_sth, int32_t filt_wgt,
- uint32_t *acc, uint16_t *cnt) {
- uint32_t row;
- uint64_t f0, f1, f2, f3;
- v16i8 frm2, frm1 = { 0 };
- v16i8 frm4, frm3 = { 0 };
- v16u8 frm_r, frm_l;
- v8i16 frm2_r, frm2_l;
- v8i16 diff0, diff1, mod0_h, mod1_h;
- v4i32 cnst3, cnst16, filt_wt, strength;
- v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
- v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
- v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll;
- v4i32 acc0, acc1, acc2, acc3;
- v8i16 cnt0, cnt1;
-
- filt_wt = __msa_fill_w(filt_wgt);
- strength = __msa_fill_w(filt_sth);
- cnst3 = __msa_ldi_w(3);
- cnst16 = __msa_ldi_w(16);
-
- for (row = 2; row--;) {
- LD4(frm1_ptr, stride, f0, f1, f2, f3);
- frm1_ptr += (4 * stride);
-
- LD_SB2(frm2_ptr, 16, frm2, frm4);
- frm2_ptr += 32;
-
- LD_SW2(acc, 4, acc0, acc1);
- LD_SW2(acc + 8, 4, acc2, acc3);
- LD_SH2(cnt, 8, cnt0, cnt1);
-
- INSERT_D2_SB(f0, f1, frm1);
- INSERT_D2_SB(f2, f3, frm3);
- ILVRL_B2_UB(frm1, frm2, frm_r, frm_l);
- HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
- UNPCK_SH_SW(diff0, diff0_r, diff0_l);
- UNPCK_SH_SW(diff1, diff1_r, diff1_l);
- MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
- mod0_w, mod1_w, mod2_w, mod3_w);
- MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w,
- mod1_w, mod2_w, mod3_w);
- SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
-
- diff0_r = (mod0_w < cnst16);
- diff0_l = (mod1_w < cnst16);
- diff1_r = (mod2_w < cnst16);
- diff1_l = (mod3_w < cnst16);
-
- SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w,
- mod1_w, mod2_w, mod3_w);
-
- mod0_w = diff0_r & mod0_w;
- mod1_w = diff0_l & mod1_w;
- mod2_w = diff1_r & mod2_w;
- mod3_w = diff1_l & mod3_w;
-
- MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
- mod0_w, mod1_w, mod2_w, mod3_w);
- PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
- ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
- ST_SH2(mod0_h, mod1_h, cnt, 8);
- cnt += 16;
-
- UNPCK_UB_SH(frm2, frm2_r, frm2_l);
- UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
- UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
- MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
- mod0_w, mod1_w, mod2_w, mod3_w);
- ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
- mod2_w, mod3_w);
-
- ST_SW2(mod0_w, mod1_w, acc, 4);
- acc += 8;
- ST_SW2(mod2_w, mod3_w, acc, 4);
- acc += 8;
-
- LD_SW2(acc, 4, acc0, acc1);
- LD_SW2(acc + 8, 4, acc2, acc3);
- LD_SH2(cnt, 8, cnt0, cnt1);
-
- ILVRL_B2_UB(frm3, frm4, frm_r, frm_l);
- HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
- UNPCK_SH_SW(diff0, diff0_r, diff0_l);
- UNPCK_SH_SW(diff1, diff1_r, diff1_l);
- MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
- mod0_w, mod1_w, mod2_w, mod3_w);
- MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w,
- mod1_w, mod2_w, mod3_w);
- SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
-
- diff0_r = (mod0_w < cnst16);
- diff0_l = (mod1_w < cnst16);
- diff1_r = (mod2_w < cnst16);
- diff1_l = (mod3_w < cnst16);
-
- SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w,
- mod1_w, mod2_w, mod3_w);
-
- mod0_w = diff0_r & mod0_w;
- mod1_w = diff0_l & mod1_w;
- mod2_w = diff1_r & mod2_w;
- mod3_w = diff1_l & mod3_w;
-
- MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
- mod0_w, mod1_w, mod2_w, mod3_w);
- PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
- ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
- ST_SH2(mod0_h, mod1_h, cnt, 8);
- cnt += 16;
- UNPCK_UB_SH(frm4, frm2_r, frm2_l);
- UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
- UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
- MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
- mod0_w, mod1_w, mod2_w, mod3_w);
- ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
- mod2_w, mod3_w);
-
- ST_SW2(mod0_w, mod1_w, acc, 4);
- acc += 8;
- ST_SW2(mod2_w, mod3_w, acc, 4);
- acc += 8;
- }
-}
-
-static void temporal_filter_apply_16size_msa(const uint8_t *frm1_ptr,
- uint32_t stride,
- const uint8_t *frm2_ptr,
- int32_t filt_sth, int32_t filt_wgt,
- uint32_t *acc, uint16_t *cnt) {
- uint32_t row;
- v16i8 frm1, frm2, frm3, frm4;
- v16u8 frm_r, frm_l;
- v16i8 zero = { 0 };
- v8u16 frm2_r, frm2_l;
- v8i16 diff0, diff1, mod0_h, mod1_h;
- v4i32 cnst3, cnst16, filt_wt, strength;
- v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
- v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
- v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll;
- v4i32 acc0, acc1, acc2, acc3;
- v8i16 cnt0, cnt1;
-
- filt_wt = __msa_fill_w(filt_wgt);
- strength = __msa_fill_w(filt_sth);
- cnst3 = __msa_ldi_w(3);
- cnst16 = __msa_ldi_w(16);
-
- for (row = 8; row--;) {
- LD_SB2(frm1_ptr, stride, frm1, frm3);
- frm1_ptr += stride;
-
- LD_SB2(frm2_ptr, 16, frm2, frm4);
- frm2_ptr += 16;
-
- LD_SW2(acc, 4, acc0, acc1);
- LD_SW2(acc, 4, acc2, acc3);
- LD_SH2(cnt, 8, cnt0, cnt1);
-
- ILVRL_B2_UB(frm1, frm2, frm_r, frm_l);
- HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
- UNPCK_SH_SW(diff0, diff0_r, diff0_l);
- UNPCK_SH_SW(diff1, diff1_r, diff1_l);
- MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
- mod0_w, mod1_w, mod2_w, mod3_w);
- MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w,
- mod1_w, mod2_w, mod3_w);
- SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
-
- diff0_r = (mod0_w < cnst16);
- diff0_l = (mod1_w < cnst16);
- diff1_r = (mod2_w < cnst16);
- diff1_l = (mod3_w < cnst16);
-
- SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w,
- mod1_w, mod2_w, mod3_w);
-
- mod0_w = diff0_r & mod0_w;
- mod1_w = diff0_l & mod1_w;
- mod2_w = diff1_r & mod2_w;
- mod3_w = diff1_l & mod3_w;
-
- MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
- mod0_w, mod1_w, mod2_w, mod3_w);
- PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
- ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
- ST_SH2(mod0_h, mod1_h, cnt, 8);
- cnt += 16;
-
- ILVRL_B2_UH(zero, frm2, frm2_r, frm2_l);
- UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
- UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
- MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
- mod0_w, mod1_w, mod2_w, mod3_w);
- ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
- mod2_w, mod3_w);
-
- ST_SW2(mod0_w, mod1_w, acc, 4);
- acc += 8;
- ST_SW2(mod2_w, mod3_w, acc, 4);
- acc += 8;
-
- LD_SW2(acc, 4, acc0, acc1);
- LD_SW2(acc + 8, 4, acc2, acc3);
- LD_SH2(cnt, 8, cnt0, cnt1);
-
- ILVRL_B2_UB(frm3, frm4, frm_r, frm_l);
- HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
- UNPCK_SH_SW(diff0, diff0_r, diff0_l);
- UNPCK_SH_SW(diff1, diff1_r, diff1_l);
- MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
- mod0_w, mod1_w, mod2_w, mod3_w);
- MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w,
- mod1_w, mod2_w, mod3_w);
- SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
-
- diff0_r = (mod0_w < cnst16);
- diff0_l = (mod1_w < cnst16);
- diff1_r = (mod2_w < cnst16);
- diff1_l = (mod3_w < cnst16);
-
- SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w,
- mod1_w, mod2_w, mod3_w);
-
- mod0_w = diff0_r & mod0_w;
- mod1_w = diff0_l & mod1_w;
- mod2_w = diff1_r & mod2_w;
- mod3_w = diff1_l & mod3_w;
-
- MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
- mod0_w, mod1_w, mod2_w, mod3_w);
- PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
- ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
- ST_SH2(mod0_h, mod1_h, cnt, 8);
- cnt += 16;
-
- ILVRL_B2_UH(zero, frm4, frm2_r, frm2_l);
- UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
- UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
- MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
- mod0_w, mod1_w, mod2_w, mod3_w);
- ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
- mod2_w, mod3_w);
- ST_SW2(mod0_w, mod1_w, acc, 4);
- acc += 8;
- ST_SW2(mod2_w, mod3_w, acc, 4);
- acc += 8;
-
- frm1_ptr += stride;
- frm2_ptr += 16;
- }
-}
-
-void vp9_temporal_filter_apply_msa(const uint8_t *frame1_ptr, uint32_t stride,
- const uint8_t *frame2_ptr, uint32_t blk_w,
- uint32_t blk_h, int32_t strength,
- int32_t filt_wgt, uint32_t *accu,
- uint16_t *cnt) {
- if (8 == (blk_w * blk_h)) {
- temporal_filter_apply_8size_msa(frame1_ptr, stride, frame2_ptr, strength,
- filt_wgt, accu, cnt);
- } else if (16 == (blk_w * blk_h)) {
- temporal_filter_apply_16size_msa(frame1_ptr, stride, frame2_ptr, strength,
- filt_wgt, accu, cnt);
- } else {
- vp9_temporal_filter_apply_c(frame1_ptr, stride, frame2_ptr, blk_w, blk_h,
- strength, filt_wgt, accu, cnt);
- }
-}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c
index b4a0bbe58bd..048ea629f5a 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c
@@ -277,8 +277,6 @@ void vp9_cyclic_refresh_postencode(VP9_COMP *const cpi) {
!cpi->oxcf.gf_cbr_boost_pct) {
// Force this frame as a golden update frame if this frame changes the
// resolution (resize_pending != 0).
- // TODO(marpan): check on forcing golden update if the background has very
- // high motion in current frame.
if (cpi->resize_pending != 0) {
vp9_cyclic_refresh_set_golden_update(cpi);
rc->frames_till_gf_update_due = rc->baseline_gf_interval;
@@ -316,6 +314,8 @@ void vp9_cyclic_refresh_set_golden_update(VP9_COMP *const cpi) {
else
rc->baseline_gf_interval = 40;
if (cpi->oxcf.rc_mode == VPX_VBR) rc->baseline_gf_interval = 20;
+ if (rc->avg_frame_low_motion < 50 && rc->frames_since_key > 40)
+ rc->baseline_gf_interval = 10;
}
// Update the segmentation map, and related quantities: cyclic refresh map,
@@ -425,6 +425,13 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) {
int target_refresh = 0;
double weight_segment_target = 0;
double weight_segment = 0;
+ cr->apply_cyclic_refresh = 1;
+ if (cm->frame_type == KEY_FRAME || cpi->svc.temporal_layer_id > 0 ||
+ (!cpi->use_svc && rc->avg_frame_low_motion < 55 &&
+ rc->frames_since_key > 40)) {
+ cr->apply_cyclic_refresh = 0;
+ return;
+ }
cr->percent_refresh = 10;
if (cr->reduce_refresh) cr->percent_refresh = 5;
cr->max_qdelta_perc = 60;
@@ -493,14 +500,8 @@ void vp9_cyclic_refresh_setup(VP9_COMP *const cpi) {
const RATE_CONTROL *const rc = &cpi->rc;
CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
struct segmentation *const seg = &cm->seg;
- // TODO(marpan): Look into whether we should reduce the amount/delta-qp
- // instead of completely shutting off at low bitrates. For now keep it on.
- // const int apply_cyclic_refresh = apply_cyclic_refresh_bitrate(cm, rc);
- const int apply_cyclic_refresh = 1;
if (cm->current_video_frame == 0) cr->low_content_avg = 0.0;
- // Don't apply refresh on key frame or temporal enhancement layer frames.
- if (!apply_cyclic_refresh || (cm->frame_type == KEY_FRAME) ||
- (cpi->force_update_segmentation) || (cpi->svc.temporal_layer_id > 0)) {
+ if (!cr->apply_cyclic_refresh || (cpi->force_update_segmentation)) {
// Set segmentation map to 0 and disable.
unsigned char *const seg_map = cpi->segmentation_map;
memset(seg_map, 0, cm->mi_rows * cm->mi_cols);
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h
index 9de5074d9ec..77fa67c9e16 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h
@@ -67,6 +67,7 @@ struct CYCLIC_REFRESH {
int qindex_delta[3];
int reduce_refresh;
double weight_segment;
+ int apply_cyclic_refresh;
};
struct VP9_COMP;
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_block.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_block.h
index 42dc6830d6c..ab488f48f0a 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_block.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_block.h
@@ -93,11 +93,6 @@ struct macroblock {
int rddiv;
int rdmult;
int mb_energy;
- int *m_search_count_ptr;
- int *ex_search_count_ptr;
-#if CONFIG_MULTITHREAD
- pthread_mutex_t *search_count_mutex;
-#endif
// These are set to their default values at the beginning, and then adjusted
// further in the encoding process.
@@ -173,6 +168,8 @@ struct macroblock {
uint8_t skip_low_source_sad;
+ uint8_t lowvar_highsumdiff;
+
uint8_t last_sb_high_content;
// For each superblock: saves the content value (e.g., low/high sad/sumdiff)
@@ -187,7 +184,7 @@ struct macroblock {
void (*fwd_txm4x4)(const int16_t *input, tran_low_t *output, int stride);
void (*itxm_add)(const tran_low_t *input, uint8_t *dest, int stride, int eob);
#if CONFIG_VP9_HIGHBITDEPTH
- void (*highbd_itxm_add)(const tran_low_t *input, uint8_t *dest, int stride,
+ void (*highbd_itxm_add)(const tran_low_t *input, uint16_t *dest, int stride,
int eob, int bd);
#endif
};
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_denoiser.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_denoiser.c
index b92557a9c40..e6933f00d8b 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_denoiser.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_denoiser.c
@@ -191,7 +191,9 @@ static VP9_DENOISER_DECISION perform_motion_compensation(
int increase_denoising, int mi_row, int mi_col, PICK_MODE_CONTEXT *ctx,
int motion_magnitude, int is_skin, int *zeromv_filter, int consec_zeromv,
int num_spatial_layers, int width) {
- int sse_diff = ctx->zeromv_sse - ctx->newmv_sse;
+ const int sse_diff = (ctx->newmv_sse == UINT_MAX)
+ ? 0
+ : ((int)ctx->zeromv_sse - (int)ctx->newmv_sse);
MV_REFERENCE_FRAME frame;
MACROBLOCKD *filter_mbd = &mb->e_mbd;
MODE_INFO *mi = filter_mbd->mi[0];
@@ -217,7 +219,6 @@ static VP9_DENOISER_DECISION perform_motion_compensation(
// difference in sum-squared-error, use it.
if (frame != INTRA_FRAME &&
(frame != GOLDEN_FRAME || num_spatial_layers == 1) &&
- ctx->newmv_sse != UINT_MAX &&
sse_diff > sse_diff_thresh(bs, increase_denoising, motion_magnitude)) {
mi->ref_frame[0] = ctx->best_reference_frame;
mi->mode = ctx->best_sse_inter_mode;
@@ -571,20 +572,26 @@ void vp9_denoiser_set_noise_level(VP9_DENOISER *denoiser, int noise_level) {
// Scale/increase the partition threshold for denoiser speed-up.
int64_t vp9_scale_part_thresh(int64_t threshold, VP9_DENOISER_LEVEL noise_level,
- int content_state) {
+ int content_state, int temporal_layer_id) {
if ((content_state == kLowSadLowSumdiff) ||
- (content_state == kHighSadLowSumdiff) || noise_level == kDenHigh)
- return (3 * threshold) >> 1;
- else
+ (content_state == kHighSadLowSumdiff) ||
+ (content_state == kLowVarHighSumdiff) || (noise_level == kDenHigh) ||
+ (temporal_layer_id != 0)) {
+ int64_t scaled_thr =
+ (temporal_layer_id < 2) ? (3 * threshold) >> 1 : (7 * threshold) >> 2;
+ return scaled_thr;
+ } else {
return (5 * threshold) >> 2;
+ }
}
// Scale/increase the ac skip threshold for denoiser speed-up.
int64_t vp9_scale_acskip_thresh(int64_t threshold,
- VP9_DENOISER_LEVEL noise_level,
- int abs_sumdiff) {
+ VP9_DENOISER_LEVEL noise_level, int abs_sumdiff,
+ int temporal_layer_id) {
if (noise_level >= kDenLow && abs_sumdiff < 5)
- return threshold *= (noise_level == kDenLow) ? 2 : 6;
+ return threshold *=
+ (noise_level == kDenLow) ? 2 : (temporal_layer_id == 2) ? 10 : 6;
else
return threshold;
}
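
The shifts in the two scaling functions above are fixed-point multipliers: (3*t)>>1 is 1.5x, (7*t)>>2 is 1.75x, and (5*t)>>2 is 1.25x, so higher temporal layers now get the more aggressive 1.75x partition threshold. A condensed sketch of the selection (function name hypothetical, factors read from the diff):

    #include <stdint.h>

    /* Hypothetical condensation of vp9_scale_part_thresh's new behavior. */
    static int64_t part_thresh_factor_sketch(int64_t t, int boost_case,
                                             int temporal_layer_id) {
      if (boost_case) /* low-sumdiff, high-noise, or enhancement-layer case */
        return temporal_layer_id < 2 ? (3 * t) >> 1 : (7 * t) >> 2;
      return (5 * t) >> 2;
    }

vp9_scale_acskip_thresh follows the same pattern with plain multipliers: 2x for kDenLow, and 6x or 10x (temporal layer 2) above that.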
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_denoiser.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_denoiser.h
index 9bded21769d..f0845e113c0 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_denoiser.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_denoiser.h
@@ -95,11 +95,11 @@ void vp9_denoiser_free(VP9_DENOISER *denoiser);
void vp9_denoiser_set_noise_level(VP9_DENOISER *denoiser, int noise_level);
int64_t vp9_scale_part_thresh(int64_t threshold, VP9_DENOISER_LEVEL noise_level,
- int content_state);
+ int content_state, int temporal_layer_id);
int64_t vp9_scale_acskip_thresh(int64_t threshold,
- VP9_DENOISER_LEVEL noise_level,
- int abs_sumdiff);
+ VP9_DENOISER_LEVEL noise_level, int abs_sumdiff,
+ int temporal_layer_id);
#ifdef __cplusplus
} // extern "C"
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeframe.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeframe.c
index 481f5a0fdac..6215e198ca6 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeframe.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeframe.c
@@ -495,11 +495,13 @@ int64_t scale_part_thresh_sumdiff(int64_t threshold_base, int speed, int width,
if (width <= 640 && height <= 480)
return (5 * threshold_base) >> 2;
else if ((content_state == kLowSadLowSumdiff) ||
- (content_state == kHighSadLowSumdiff))
+ (content_state == kHighSadLowSumdiff) ||
+ (content_state == kLowVarHighSumdiff))
return (5 * threshold_base) >> 2;
} else if (speed == 7) {
if ((content_state == kLowSadLowSumdiff) ||
- (content_state == kHighSadLowSumdiff)) {
+ (content_state == kHighSadLowSumdiff) ||
+ (content_state == kLowVarHighSumdiff)) {
return (5 * threshold_base) >> 2;
}
}
@@ -536,10 +538,11 @@ static void set_vbp_thresholds(VP9_COMP *cpi, int64_t thresholds[], int q,
threshold_base = (7 * threshold_base) >> 3;
}
#if CONFIG_VP9_TEMPORAL_DENOISING
- if (cpi->oxcf.noise_sensitivity > 0 && cpi->oxcf.speed > 5 &&
- cpi->denoiser.denoising_level >= kDenLow)
- threshold_base = vp9_scale_part_thresh(
- threshold_base, cpi->denoiser.denoising_level, content_state);
+ if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) &&
+ cpi->oxcf.speed > 5 && cpi->denoiser.denoising_level >= kDenLow)
+ threshold_base =
+ vp9_scale_part_thresh(threshold_base, cpi->denoiser.denoising_level,
+ content_state, cpi->svc.temporal_layer_id);
else
threshold_base =
scale_part_thresh_sumdiff(threshold_base, cpi->oxcf.speed, cm->width,
@@ -838,7 +841,8 @@ static void set_low_temp_var_flag(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd,
}
}
-static void copy_partitioning_helper(VP9_COMP *cpi, BLOCK_SIZE bsize,
+static void copy_partitioning_helper(VP9_COMP *cpi, MACROBLOCK *x,
+ MACROBLOCKD *xd, BLOCK_SIZE bsize,
int mi_row, int mi_col) {
VP9_COMMON *const cm = &cpi->common;
BLOCK_SIZE *prev_part = cpi->prev_partition;
@@ -848,49 +852,61 @@ static void copy_partitioning_helper(VP9_COMP *cpi, BLOCK_SIZE bsize,
const int bs = (1 << bsl) / 4;
BLOCK_SIZE subsize;
PARTITION_TYPE partition;
- MODE_INFO *mi = NULL;
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
partition = partition_lookup[bsl][prev_part[start_pos]];
subsize = get_subsize(bsize, partition);
- mi = cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col];
if (subsize < BLOCK_8X8) {
- mi->sb_type = bsize;
+ set_block_size(cpi, x, xd, mi_row, mi_col, bsize);
} else {
switch (partition) {
- case PARTITION_NONE: mi->sb_type = bsize; break;
+ case PARTITION_NONE:
+ set_block_size(cpi, x, xd, mi_row, mi_col, bsize);
+ break;
case PARTITION_HORZ:
- mi->sb_type = subsize;
- if (mi_row + bs < cm->mi_rows)
- cm->mi_grid_visible[(mi_row + bs) * cm->mi_stride + mi_col]->sb_type =
- subsize;
+ set_block_size(cpi, x, xd, mi_row, mi_col, subsize);
+ set_block_size(cpi, x, xd, mi_row + bs, mi_col, subsize);
break;
case PARTITION_VERT:
- mi->sb_type = subsize;
- if (mi_col + bs < cm->mi_cols)
- cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col + bs]->sb_type =
- subsize;
+ set_block_size(cpi, x, xd, mi_row, mi_col, subsize);
+ set_block_size(cpi, x, xd, mi_row, mi_col + bs, subsize);
break;
case PARTITION_SPLIT:
- copy_partitioning_helper(cpi, subsize, mi_row, mi_col);
- copy_partitioning_helper(cpi, subsize, mi_row + bs, mi_col);
- copy_partitioning_helper(cpi, subsize, mi_row, mi_col + bs);
- copy_partitioning_helper(cpi, subsize, mi_row + bs, mi_col + bs);
+ copy_partitioning_helper(cpi, x, xd, subsize, mi_row, mi_col);
+ copy_partitioning_helper(cpi, x, xd, subsize, mi_row + bs, mi_col);
+ copy_partitioning_helper(cpi, x, xd, subsize, mi_row, mi_col + bs);
+ copy_partitioning_helper(cpi, x, xd, subsize, mi_row + bs, mi_col + bs);
break;
default: assert(0);
}
}
}
-static int copy_partitioning(VP9_COMP *cpi, MACROBLOCK *x, int mi_row,
- int mi_col, int segment_id, int sb_offset) {
- if (cpi->rc.frames_since_key > 1 && segment_id == CR_SEGMENT_ID_BASE &&
+static int copy_partitioning(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd,
+ int mi_row, int mi_col, int segment_id,
+ int sb_offset) {
+ int svc_copy_allowed = 1;
+ int frames_since_key_thresh = 1;
+ if (cpi->use_svc) {
+ // For SVC, don't allow copy if base spatial layer is key frame, or if
+ // frame is not a temporal enhancement layer frame.
+ int layer = LAYER_IDS_TO_IDX(0, cpi->svc.temporal_layer_id,
+ cpi->svc.number_temporal_layers);
+ const LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer];
+ if (lc->is_key_frame ||
+ (cpi->svc.temporal_layer_id != cpi->svc.number_temporal_layers - 1 &&
+ cpi->svc.number_temporal_layers > 1))
+ svc_copy_allowed = 0;
+ frames_since_key_thresh = cpi->svc.number_spatial_layers << 1;
+ }
+ if (cpi->rc.frames_since_key > frames_since_key_thresh && svc_copy_allowed &&
+ !cpi->resize_pending && segment_id == CR_SEGMENT_ID_BASE &&
cpi->prev_segment_id[sb_offset] == CR_SEGMENT_ID_BASE &&
cpi->copied_frame_cnt[sb_offset] < cpi->max_copied_frame) {
if (cpi->prev_partition != NULL) {
- copy_partitioning_helper(cpi, BLOCK_64X64, mi_row, mi_col);
+ copy_partitioning_helper(cpi, x, xd, BLOCK_64X64, mi_row, mi_col);
cpi->copied_frame_cnt[sb_offset] += 1;
memcpy(x->variance_low, &(cpi->prev_variance_low[sb_offset * 25]),
sizeof(x->variance_low));
@@ -946,9 +962,16 @@ static void chroma_check(VP9_COMP *cpi, MACROBLOCK *x, int bsize,
unsigned int y_sad, int is_key_frame) {
int i;
MACROBLOCKD *xd = &x->e_mbd;
+
+ if (is_key_frame) return;
+
// For speed >= 8, avoid the chroma check if y_sad is above threshold.
- if (is_key_frame || (cpi->oxcf.speed >= 8 && y_sad > cpi->vbp_thresholds[1]))
- return;
+ if (cpi->oxcf.speed >= 8) {
+ if (y_sad > cpi->vbp_thresholds[1] &&
+ (!cpi->noise_estimate.enabled ||
+ vp9_noise_estimate_extract_level(&cpi->noise_estimate) < kMedium))
+ return;
+ }
for (i = 1; i <= 2; ++i) {
unsigned int uv_sad = UINT_MAX;
@@ -994,6 +1017,11 @@ static void avg_source_sad(VP9_COMP *cpi, MACROBLOCK *x, int shift,
else
x->content_state_sb = ((tmp_sse - tmp_variance) < 25) ? kHighSadLowSumdiff
: kHighSadHighSumdiff;
+
+ // Detect large lighting change.
+ if (tmp_variance < (tmp_sse >> 3) && (tmp_sse - tmp_variance) > 10000)
+ x->content_state_sb = kLowVarHighSumdiff;
+
if (cpi->content_state_sb_fd != NULL) {
if (tmp_sad < avg_source_sad_threshold2) {
// Cap the increment to 255.
@@ -1061,11 +1089,12 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
content_state == kLowSadHighSumdiff)
? 1
: 0;
+ x->lowvar_highsumdiff = (content_state == kLowVarHighSumdiff) ? 1 : 0;
if (cpi->content_state_sb_fd != NULL)
x->last_sb_high_content = cpi->content_state_sb_fd[sb_offset2];
// If source_sad is low copy the partition without computing the y_sad.
if (x->skip_low_source_sad && cpi->sf.copy_partition_flag &&
- copy_partitioning(cpi, x, mi_row, mi_col, segment_id, sb_offset)) {
+ copy_partitioning(cpi, x, xd, mi_row, mi_col, segment_id, sb_offset)) {
return 0;
}
}
@@ -1192,7 +1221,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
// Stop the copy every cpi->max_copied_frame to refresh the partition.
// TODO(jianj) : tune the threshold.
if (cpi->sf.copy_partition_flag && y_sad_last < cpi->vbp_threshold_copy &&
- copy_partitioning(cpi, x, mi_row, mi_col, segment_id, sb_offset)) {
+ copy_partitioning(cpi, x, xd, mi_row, mi_col, segment_id, sb_offset)) {
chroma_check(cpi, x, bsize, y_sad, is_key_frame);
return 0;
}
@@ -4110,6 +4139,7 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td,
x->color_sensitivity[1] = 0;
x->sb_is_skin = 0;
x->skip_low_source_sad = 0;
+ x->lowvar_highsumdiff = 0;
x->content_state_sb = 0;
if (seg->enabled) {
@@ -4341,8 +4371,6 @@ void vp9_init_tile_data(VP9_COMP *cpi) {
}
}
#if CONFIG_MULTITHREAD
- tile_data->search_count_mutex = NULL;
- tile_data->enc_row_mt_mutex = NULL;
tile_data->row_base_thresh_freq_fact = NULL;
#endif
}
@@ -4361,10 +4389,6 @@ void vp9_init_tile_data(VP9_COMP *cpi) {
cpi->tplist[tile_row][tile_col] = tplist + tplist_count;
tplist = cpi->tplist[tile_row][tile_col];
tplist_count = get_num_vert_units(*tile_info, MI_BLOCK_SIZE_LOG2);
-
- // Set up pointers to per thread motion search counters.
- this_tile->m_search_count = 0; // Count of motion search hits.
- this_tile->ex_search_count = 0; // Exhaustive mesh search hits.
}
}
}
@@ -4409,13 +4433,6 @@ void vp9_encode_tile(VP9_COMP *cpi, ThreadData *td, int tile_row,
const int mi_row_end = tile_info->mi_row_end;
int mi_row;
- // Set up pointers to per thread motion search counters.
- td->mb.m_search_count_ptr = &this_tile->m_search_count;
- td->mb.ex_search_count_ptr = &this_tile->ex_search_count;
-#if CONFIG_MULTITHREAD
- td->mb.search_count_mutex = this_tile->search_count_mutex;
-#endif
-
for (mi_row = mi_row_start; mi_row < mi_row_end; mi_row += MI_BLOCK_SIZE)
vp9_encode_sb_row(cpi, td, tile_row, tile_col, mi_row);
}
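
One detail worth unpacking from the avg_source_sad hunk: the vpx variance kernels return variance = sse - sum*sum/n, so sse - variance is the squared mean change scaled by the block size. Low variance paired with a large sse - variance therefore indicates a near-constant brightness offset across the block, which is why the new test flags a lighting change. A sketch of that predicate (helper name hypothetical, constants from the diff):

    /* Hypothetical wrapper around the kLowVarHighSumdiff test added above:
     * variance small relative to sse, yet a large squared-mean component,
     * means the block moved by a near-constant offset. */
    static int is_lighting_change_sketch(unsigned int sse,
                                         unsigned int variance) {
      return variance < (sse >> 3) && (sse - variance) > 10000;
    }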
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemb.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemb.c
index 0940d9a6153..7e30499c573 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemb.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemb.c
@@ -637,24 +637,25 @@ static void encode_block(int plane, int block, int row, int col,
if (x->skip_encode || p->eobs[block] == 0) return;
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst);
switch (tx_size) {
case TX_32X32:
- vp9_highbd_idct32x32_add(dqcoeff, dst, pd->dst.stride, p->eobs[block],
+ vp9_highbd_idct32x32_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block],
xd->bd);
break;
case TX_16X16:
- vp9_highbd_idct16x16_add(dqcoeff, dst, pd->dst.stride, p->eobs[block],
+ vp9_highbd_idct16x16_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block],
xd->bd);
break;
case TX_8X8:
- vp9_highbd_idct8x8_add(dqcoeff, dst, pd->dst.stride, p->eobs[block],
+ vp9_highbd_idct8x8_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block],
xd->bd);
break;
case TX_4X4:
// this is like vp9_short_idct4x4 but has a special case around eob<=1
// which is significant (not just an optimization) for the lossless
// case.
- x->highbd_itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block],
+ x->highbd_itxm_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block],
xd->bd);
break;
default: assert(0 && "Invalid transform size");
@@ -699,7 +700,8 @@ static void encode_block_pass1(int plane, int block, int row, int col,
if (p->eobs[block] > 0) {
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- x->highbd_itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block], xd->bd);
+ x->highbd_itxm_add(dqcoeff, CONVERT_TO_SHORTPTR(dst), pd->dst.stride,
+ p->eobs[block], xd->bd);
return;
}
#endif // CONFIG_VP9_HIGHBITDEPTH
@@ -799,6 +801,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col,
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst);
switch (tx_size) {
case TX_32X32:
if (!x->skip_recode) {
@@ -810,8 +813,11 @@ void vp9_encode_block_intra(int plane, int block, int row, int col,
qcoeff, dqcoeff, pd->dequant, eob,
scan_order->scan, scan_order->iscan);
}
+ if (args->enable_coeff_opt && !x->skip_recode) {
+ *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;
+ }
if (!x->skip_encode && *eob) {
- vp9_highbd_idct32x32_add(dqcoeff, dst, dst_stride, *eob, xd->bd);
+ vp9_highbd_idct32x32_add(dqcoeff, dst16, dst_stride, *eob, xd->bd);
}
break;
case TX_16X16:
@@ -827,8 +833,11 @@ void vp9_encode_block_intra(int plane, int block, int row, int col,
pd->dequant, eob, scan_order->scan,
scan_order->iscan);
}
+ if (args->enable_coeff_opt && !x->skip_recode) {
+ *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;
+ }
if (!x->skip_encode && *eob) {
- vp9_highbd_iht16x16_add(tx_type, dqcoeff, dst, dst_stride, *eob,
+ vp9_highbd_iht16x16_add(tx_type, dqcoeff, dst16, dst_stride, *eob,
xd->bd);
}
break;
@@ -845,8 +854,11 @@ void vp9_encode_block_intra(int plane, int block, int row, int col,
pd->dequant, eob, scan_order->scan,
scan_order->iscan);
}
+ if (args->enable_coeff_opt && !x->skip_recode) {
+ *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;
+ }
if (!x->skip_encode && *eob) {
- vp9_highbd_iht8x8_add(tx_type, dqcoeff, dst, dst_stride, *eob,
+ vp9_highbd_iht8x8_add(tx_type, dqcoeff, dst16, dst_stride, *eob,
xd->bd);
}
break;
@@ -863,15 +875,18 @@ void vp9_encode_block_intra(int plane, int block, int row, int col,
pd->dequant, eob, scan_order->scan,
scan_order->iscan);
}
-
+ if (args->enable_coeff_opt && !x->skip_recode) {
+ *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;
+ }
if (!x->skip_encode && *eob) {
if (tx_type == DCT_DCT) {
// this is like vp9_short_idct4x4 but has a special case around
// eob<=1 which is significant (not just an optimization) for the
// lossless case.
- x->highbd_itxm_add(dqcoeff, dst, dst_stride, *eob, xd->bd);
+ x->highbd_itxm_add(dqcoeff, dst16, dst_stride, *eob, xd->bd);
} else {
- vp9_highbd_iht4x4_16_add(dqcoeff, dst, dst_stride, tx_type, xd->bd);
+ vp9_highbd_iht4x4_16_add(dqcoeff, dst16, dst_stride, tx_type,
+ xd->bd);
}
}
break;
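
The TX_4X4 branches above route through the x->highbd_itxm_add hook rather than a direct inverse-transform call because, per the in-code comment, lossless streams need the inverse WHT with its eob<=1 special case. The diff only shows the call sites; a hedged sketch of how the hook is presumably bound elsewhere in the encoder (the tran_low_t placeholder stands in for the config-dependent type):

    #include <stdint.h>

    typedef int32_t tran_low_t; /* placeholder; actual type is config-dependent */
    typedef void (*highbd_itxm_add_fn)(const tran_low_t *input, uint16_t *dest,
                                       int stride, int eob, int bd);

    void vp9_highbd_iwht4x4_add(const tran_low_t *, uint16_t *, int, int, int);
    void vp9_highbd_idct4x4_add(const tran_low_t *, uint16_t *, int, int, int);

    /* Assumed selection logic; the diff itself only updates the callers. */
    static highbd_itxm_add_fn select_highbd_itxm_add(int lossless) {
      return lossless ? vp9_highbd_iwht4x4_add : vp9_highbd_idct4x4_add;
    }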
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.c
index 1dc70d2d361..f57f40dbe4c 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.c
@@ -73,6 +73,9 @@
// chosen.
// #define OUTPUT_YUV_REC
+#define FRAME_SIZE_FACTOR 128  // Empirical parameter for the context model threshold.
+#define FRAME_RATE_FACTOR 8
+
#ifdef OUTPUT_YUV_DENOISED
FILE *yuv_denoised_file = NULL;
#endif
@@ -100,6 +103,331 @@ static int is_spatial_denoise_enabled(VP9_COMP *cpi) {
}
#endif
+// Compute the adaptive threshold used to decide whether to skip recoding.
+static int compute_context_model_thresh(const VP9_COMP *const cpi) {
+ const VP9_COMMON *const cm = &cpi->common;
+ const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+ const int frame_size = (cm->width * cm->height) >> 10;
+ const int bitrate = (int)(oxcf->target_bandwidth >> 10);
+ const int qindex_factor = cm->base_qindex + (MAXQ >> 1);
+
+  // This equation makes the threshold adaptive to frame size.
+  // The coding gain from recoding comes from alternate frames with large
+  // content changes; we skip recoding when the difference between the
+  // previous and current frame context probability models is below this
+  // threshold. The frame-size component is the most critical part for
+  // guaranteeing adaptivity. The other parameters are estimated from a
+  // typical HD setting, e.g. frame_size = 1920x1080, bitrate = 8000,
+  // qindex_factor < 50.
+ const int thresh =
+ ((FRAME_SIZE_FACTOR * frame_size - FRAME_RATE_FACTOR * bitrate) *
+ qindex_factor) >>
+ 9;
+
+ return thresh;
+}
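
To make the threshold formula concrete, a worked evaluation with the HD setting mentioned in the comment (frame size and bitrate from the comment; qindex_factor chosen arbitrarily for illustration):

    #include <stdio.h>

    #define FRAME_SIZE_FACTOR 128
    #define FRAME_RATE_FACTOR 8

    int main(void) {
      const int frame_size = (1920 * 1080) >> 10; /* = 2025 */
      const int bitrate = 8000;                   /* from the comment */
      const int qindex_factor = 150;              /* illustrative only */
      const int thresh =
          ((FRAME_SIZE_FACTOR * frame_size - FRAME_RATE_FACTOR * bitrate) *
           qindex_factor) >> 9;
      /* (128*2025 - 8*8000) * 150 >> 9 = 195200 * 150 / 512 = 57187 */
      printf("thresh = %d\n", thresh);
      return 0;
    }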
+
+// Compute the total cost difference between the current and previous
+// frame context probability models.
+static int compute_context_model_diff(const VP9_COMMON *const cm) {
+ const FRAME_CONTEXT *const pre_fc =
+ &cm->frame_contexts[cm->frame_context_idx];
+ const FRAME_CONTEXT *const cur_fc = cm->fc;
+ const FRAME_COUNTS *counts = &cm->counts;
+ vpx_prob pre_last_prob, cur_last_prob;
+ int diff = 0;
+ int i, j, k, l, m, n;
+
+ // y_mode_prob
+ for (i = 0; i < BLOCK_SIZE_GROUPS; ++i) {
+ for (j = 0; j < INTRA_MODES - 1; ++j) {
+ diff += (int)counts->y_mode[i][j] *
+ (pre_fc->y_mode_prob[i][j] - cur_fc->y_mode_prob[i][j]);
+ }
+ pre_last_prob = MAX_PROB - pre_fc->y_mode_prob[i][INTRA_MODES - 2];
+ cur_last_prob = MAX_PROB - cur_fc->y_mode_prob[i][INTRA_MODES - 2];
+
+ diff += (int)counts->y_mode[i][INTRA_MODES - 1] *
+ (pre_last_prob - cur_last_prob);
+ }
+
+ // uv_mode_prob
+ for (i = 0; i < INTRA_MODES; ++i) {
+ for (j = 0; j < INTRA_MODES - 1; ++j) {
+ diff += (int)counts->uv_mode[i][j] *
+ (pre_fc->uv_mode_prob[i][j] - cur_fc->uv_mode_prob[i][j]);
+ }
+ pre_last_prob = MAX_PROB - pre_fc->uv_mode_prob[i][INTRA_MODES - 2];
+ cur_last_prob = MAX_PROB - cur_fc->uv_mode_prob[i][INTRA_MODES - 2];
+
+ diff += (int)counts->uv_mode[i][INTRA_MODES - 1] *
+ (pre_last_prob - cur_last_prob);
+ }
+
+ // partition_prob
+ for (i = 0; i < PARTITION_CONTEXTS; ++i) {
+ for (j = 0; j < PARTITION_TYPES - 1; ++j) {
+ diff += (int)counts->partition[i][j] *
+ (pre_fc->partition_prob[i][j] - cur_fc->partition_prob[i][j]);
+ }
+ pre_last_prob = MAX_PROB - pre_fc->partition_prob[i][PARTITION_TYPES - 2];
+ cur_last_prob = MAX_PROB - cur_fc->partition_prob[i][PARTITION_TYPES - 2];
+
+ diff += (int)counts->partition[i][PARTITION_TYPES - 1] *
+ (pre_last_prob - cur_last_prob);
+ }
+
+ // coef_probs
+ for (i = 0; i < TX_SIZES; ++i) {
+ for (j = 0; j < PLANE_TYPES; ++j) {
+ for (k = 0; k < REF_TYPES; ++k) {
+ for (l = 0; l < COEF_BANDS; ++l) {
+ for (m = 0; m < BAND_COEFF_CONTEXTS(l); ++m) {
+ for (n = 0; n < UNCONSTRAINED_NODES; ++n) {
+ diff += (int)counts->coef[i][j][k][l][m][n] *
+ (pre_fc->coef_probs[i][j][k][l][m][n] -
+ cur_fc->coef_probs[i][j][k][l][m][n]);
+ }
+
+ pre_last_prob =
+ MAX_PROB -
+ pre_fc->coef_probs[i][j][k][l][m][UNCONSTRAINED_NODES - 1];
+ cur_last_prob =
+ MAX_PROB -
+ cur_fc->coef_probs[i][j][k][l][m][UNCONSTRAINED_NODES - 1];
+
+ diff += (int)counts->coef[i][j][k][l][m][UNCONSTRAINED_NODES] *
+ (pre_last_prob - cur_last_prob);
+ }
+ }
+ }
+ }
+ }
+
+ // switchable_interp_prob
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) {
+ for (j = 0; j < SWITCHABLE_FILTERS - 1; ++j) {
+ diff += (int)counts->switchable_interp[i][j] *
+ (pre_fc->switchable_interp_prob[i][j] -
+ cur_fc->switchable_interp_prob[i][j]);
+ }
+ pre_last_prob =
+ MAX_PROB - pre_fc->switchable_interp_prob[i][SWITCHABLE_FILTERS - 2];
+ cur_last_prob =
+ MAX_PROB - cur_fc->switchable_interp_prob[i][SWITCHABLE_FILTERS - 2];
+
+ diff += (int)counts->switchable_interp[i][SWITCHABLE_FILTERS - 1] *
+ (pre_last_prob - cur_last_prob);
+ }
+
+ // inter_mode_probs
+ for (i = 0; i < INTER_MODE_CONTEXTS; ++i) {
+ for (j = 0; j < INTER_MODES - 1; ++j) {
+ diff += (int)counts->inter_mode[i][j] *
+ (pre_fc->inter_mode_probs[i][j] - cur_fc->inter_mode_probs[i][j]);
+ }
+ pre_last_prob = MAX_PROB - pre_fc->inter_mode_probs[i][INTER_MODES - 2];
+ cur_last_prob = MAX_PROB - cur_fc->inter_mode_probs[i][INTER_MODES - 2];
+
+ diff += (int)counts->inter_mode[i][INTER_MODES - 1] *
+ (pre_last_prob - cur_last_prob);
+ }
+
+ // intra_inter_prob
+ for (i = 0; i < INTRA_INTER_CONTEXTS; ++i) {
+ diff += (int)counts->intra_inter[i][0] *
+ (pre_fc->intra_inter_prob[i] - cur_fc->intra_inter_prob[i]);
+
+ pre_last_prob = MAX_PROB - pre_fc->intra_inter_prob[i];
+ cur_last_prob = MAX_PROB - cur_fc->intra_inter_prob[i];
+
+ diff += (int)counts->intra_inter[i][1] * (pre_last_prob - cur_last_prob);
+ }
+
+ // comp_inter_prob
+ for (i = 0; i < COMP_INTER_CONTEXTS; ++i) {
+ diff += (int)counts->comp_inter[i][0] *
+ (pre_fc->comp_inter_prob[i] - cur_fc->comp_inter_prob[i]);
+
+ pre_last_prob = MAX_PROB - pre_fc->comp_inter_prob[i];
+ cur_last_prob = MAX_PROB - cur_fc->comp_inter_prob[i];
+
+ diff += (int)counts->comp_inter[i][1] * (pre_last_prob - cur_last_prob);
+ }
+
+ // single_ref_prob
+ for (i = 0; i < REF_CONTEXTS; ++i) {
+ for (j = 0; j < 2; ++j) {
+ diff += (int)counts->single_ref[i][j][0] *
+ (pre_fc->single_ref_prob[i][j] - cur_fc->single_ref_prob[i][j]);
+
+ pre_last_prob = MAX_PROB - pre_fc->single_ref_prob[i][j];
+ cur_last_prob = MAX_PROB - cur_fc->single_ref_prob[i][j];
+
+ diff +=
+ (int)counts->single_ref[i][j][1] * (pre_last_prob - cur_last_prob);
+ }
+ }
+
+ // comp_ref_prob
+ for (i = 0; i < REF_CONTEXTS; ++i) {
+ diff += (int)counts->comp_ref[i][0] *
+ (pre_fc->comp_ref_prob[i] - cur_fc->comp_ref_prob[i]);
+
+ pre_last_prob = MAX_PROB - pre_fc->comp_ref_prob[i];
+ cur_last_prob = MAX_PROB - cur_fc->comp_ref_prob[i];
+
+ diff += (int)counts->comp_ref[i][1] * (pre_last_prob - cur_last_prob);
+ }
+
+ // tx_probs
+ for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
+ // p32x32
+ for (j = 0; j < TX_SIZES - 1; ++j) {
+ diff += (int)counts->tx.p32x32[i][j] *
+ (pre_fc->tx_probs.p32x32[i][j] - cur_fc->tx_probs.p32x32[i][j]);
+ }
+ pre_last_prob = MAX_PROB - pre_fc->tx_probs.p32x32[i][TX_SIZES - 2];
+ cur_last_prob = MAX_PROB - cur_fc->tx_probs.p32x32[i][TX_SIZES - 2];
+
+ diff += (int)counts->tx.p32x32[i][TX_SIZES - 1] *
+ (pre_last_prob - cur_last_prob);
+
+ // p16x16
+ for (j = 0; j < TX_SIZES - 2; ++j) {
+ diff += (int)counts->tx.p16x16[i][j] *
+ (pre_fc->tx_probs.p16x16[i][j] - cur_fc->tx_probs.p16x16[i][j]);
+ }
+ pre_last_prob = MAX_PROB - pre_fc->tx_probs.p16x16[i][TX_SIZES - 3];
+ cur_last_prob = MAX_PROB - cur_fc->tx_probs.p16x16[i][TX_SIZES - 3];
+
+ diff += (int)counts->tx.p16x16[i][TX_SIZES - 2] *
+ (pre_last_prob - cur_last_prob);
+
+ // p8x8
+ for (j = 0; j < TX_SIZES - 3; ++j) {
+ diff += (int)counts->tx.p8x8[i][j] *
+ (pre_fc->tx_probs.p8x8[i][j] - cur_fc->tx_probs.p8x8[i][j]);
+ }
+ pre_last_prob = MAX_PROB - pre_fc->tx_probs.p8x8[i][TX_SIZES - 4];
+ cur_last_prob = MAX_PROB - cur_fc->tx_probs.p8x8[i][TX_SIZES - 4];
+
+ diff +=
+ (int)counts->tx.p8x8[i][TX_SIZES - 3] * (pre_last_prob - cur_last_prob);
+ }
+
+ // skip_probs
+ for (i = 0; i < SKIP_CONTEXTS; ++i) {
+ diff += (int)counts->skip[i][0] *
+ (pre_fc->skip_probs[i] - cur_fc->skip_probs[i]);
+
+ pre_last_prob = MAX_PROB - pre_fc->skip_probs[i];
+ cur_last_prob = MAX_PROB - cur_fc->skip_probs[i];
+
+ diff += (int)counts->skip[i][1] * (pre_last_prob - cur_last_prob);
+ }
+
+ // mv
+ for (i = 0; i < MV_JOINTS - 1; ++i) {
+ diff += (int)counts->mv.joints[i] *
+ (pre_fc->nmvc.joints[i] - cur_fc->nmvc.joints[i]);
+ }
+ pre_last_prob = MAX_PROB - pre_fc->nmvc.joints[MV_JOINTS - 2];
+ cur_last_prob = MAX_PROB - cur_fc->nmvc.joints[MV_JOINTS - 2];
+
+ diff +=
+ (int)counts->mv.joints[MV_JOINTS - 1] * (pre_last_prob - cur_last_prob);
+
+ for (i = 0; i < 2; ++i) {
+ const nmv_component_counts *nmv_count = &counts->mv.comps[i];
+ const nmv_component *pre_nmv_prob = &pre_fc->nmvc.comps[i];
+ const nmv_component *cur_nmv_prob = &cur_fc->nmvc.comps[i];
+
+ // sign
+ diff += (int)nmv_count->sign[0] * (pre_nmv_prob->sign - cur_nmv_prob->sign);
+
+ pre_last_prob = MAX_PROB - pre_nmv_prob->sign;
+ cur_last_prob = MAX_PROB - cur_nmv_prob->sign;
+
+ diff += (int)nmv_count->sign[1] * (pre_last_prob - cur_last_prob);
+
+ // classes
+ for (j = 0; j < MV_CLASSES - 1; ++j) {
+ diff += (int)nmv_count->classes[j] *
+ (pre_nmv_prob->classes[j] - cur_nmv_prob->classes[j]);
+ }
+ pre_last_prob = MAX_PROB - pre_nmv_prob->classes[MV_CLASSES - 2];
+ cur_last_prob = MAX_PROB - cur_nmv_prob->classes[MV_CLASSES - 2];
+
+ diff += (int)nmv_count->classes[MV_CLASSES - 1] *
+ (pre_last_prob - cur_last_prob);
+
+ // class0
+ for (j = 0; j < CLASS0_SIZE - 1; ++j) {
+ diff += (int)nmv_count->class0[j] *
+ (pre_nmv_prob->class0[j] - cur_nmv_prob->class0[j]);
+ }
+ pre_last_prob = MAX_PROB - pre_nmv_prob->class0[CLASS0_SIZE - 2];
+ cur_last_prob = MAX_PROB - cur_nmv_prob->class0[CLASS0_SIZE - 2];
+
+ diff += (int)nmv_count->class0[CLASS0_SIZE - 1] *
+ (pre_last_prob - cur_last_prob);
+
+ // bits
+ for (j = 0; j < MV_OFFSET_BITS; ++j) {
+ diff += (int)nmv_count->bits[j][0] *
+ (pre_nmv_prob->bits[j] - cur_nmv_prob->bits[j]);
+
+ pre_last_prob = MAX_PROB - pre_nmv_prob->bits[j];
+ cur_last_prob = MAX_PROB - cur_nmv_prob->bits[j];
+
+ diff += (int)nmv_count->bits[j][1] * (pre_last_prob - cur_last_prob);
+ }
+
+ // class0_fp
+ for (j = 0; j < CLASS0_SIZE; ++j) {
+ for (k = 0; k < MV_FP_SIZE - 1; ++k) {
+ diff += (int)nmv_count->class0_fp[j][k] *
+ (pre_nmv_prob->class0_fp[j][k] - cur_nmv_prob->class0_fp[j][k]);
+ }
+ pre_last_prob = MAX_PROB - pre_nmv_prob->class0_fp[j][MV_FP_SIZE - 2];
+ cur_last_prob = MAX_PROB - cur_nmv_prob->class0_fp[j][MV_FP_SIZE - 2];
+
+ diff += (int)nmv_count->class0_fp[j][MV_FP_SIZE - 1] *
+ (pre_last_prob - cur_last_prob);
+ }
+
+ // fp
+ for (j = 0; j < MV_FP_SIZE - 1; ++j) {
+ diff +=
+ (int)nmv_count->fp[j] * (pre_nmv_prob->fp[j] - cur_nmv_prob->fp[j]);
+ }
+ pre_last_prob = MAX_PROB - pre_nmv_prob->fp[MV_FP_SIZE - 2];
+ cur_last_prob = MAX_PROB - cur_nmv_prob->fp[MV_FP_SIZE - 2];
+
+ diff +=
+ (int)nmv_count->fp[MV_FP_SIZE - 1] * (pre_last_prob - cur_last_prob);
+
+ // class0_hp
+ diff += (int)nmv_count->class0_hp[0] *
+ (pre_nmv_prob->class0_hp - cur_nmv_prob->class0_hp);
+
+ pre_last_prob = MAX_PROB - pre_nmv_prob->class0_hp;
+ cur_last_prob = MAX_PROB - cur_nmv_prob->class0_hp;
+
+ diff += (int)nmv_count->class0_hp[1] * (pre_last_prob - cur_last_prob);
+
+ // hp
+ diff += (int)nmv_count->hp[0] * (pre_nmv_prob->hp - cur_nmv_prob->hp);
+
+ pre_last_prob = MAX_PROB - pre_nmv_prob->hp;
+ cur_last_prob = MAX_PROB - cur_nmv_prob->hp;
+
+ diff += (int)nmv_count->hp[1] * (pre_last_prob - cur_last_prob);
+ }
+
+ return -diff;
+}
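// Condensed reading of the pattern repeated above (an interpretation, not
// code from the patch): every binary-tree node contributes
//   diff += (int)count * (pre_prob - cur_prob);
// and the last symbol of each tree, which has no stored probability, uses
// the MAX_PROB complement of the preceding node. Since the function returns
// -diff, i.e. sum(count * (cur_prob - pre_prob)), a large positive result
// suggests the updated context fits the collected counts much better and a
// recode is likely to pay off; a result below the adaptive threshold lets
// the encoder skip the recode (see encode_with_recode_loop below).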
+
// Test for whether to calculate metrics for the frame.
static int is_psnr_calc_enabled(VP9_COMP *cpi) {
VP9_COMMON *const cm = &cpi->common;
@@ -110,22 +438,22 @@ static int is_psnr_calc_enabled(VP9_COMP *cpi) {
/* clang-format off */
const Vp9LevelSpec vp9_level_defs[VP9_LEVELS] = {
- { LEVEL_1, 829440, 36864, 200, 400, 2, 1, 4, 8 },
- { LEVEL_1_1, 2764800, 73728, 800, 1000, 2, 1, 4, 8 },
- { LEVEL_2, 4608000, 122880, 1800, 1500, 2, 1, 4, 8 },
- { LEVEL_2_1, 9216000, 245760, 3600, 2800, 2, 2, 4, 8 },
- { LEVEL_3, 20736000, 552960, 7200, 6000, 2, 4, 4, 8 },
- { LEVEL_3_1, 36864000, 983040, 12000, 10000, 2, 4, 4, 8 },
- { LEVEL_4, 83558400, 2228224, 18000, 16000, 4, 4, 4, 8 },
- { LEVEL_4_1, 160432128, 2228224, 30000, 18000, 4, 4, 5, 6 },
- { LEVEL_5, 311951360, 8912896, 60000, 36000, 6, 8, 6, 4 },
- { LEVEL_5_1, 588251136, 8912896, 120000, 46000, 8, 8, 10, 4 },
+ { LEVEL_1, 829440, 36864, 200, 400, 2, 1, 4, 8 },
+ { LEVEL_1_1, 2764800, 73728, 800, 1000, 2, 1, 4, 8 },
+ { LEVEL_2, 4608000, 122880, 1800, 1500, 2, 1, 4, 8 },
+ { LEVEL_2_1, 9216000, 245760, 3600, 2800, 2, 2, 4, 8 },
+ { LEVEL_3, 20736000, 552960, 7200, 6000, 2, 4, 4, 8 },
+ { LEVEL_3_1, 36864000, 983040, 12000, 10000, 2, 4, 4, 8 },
+ { LEVEL_4, 83558400, 2228224, 18000, 16000, 4, 4, 4, 8 },
+ { LEVEL_4_1, 160432128, 2228224, 30000, 18000, 4, 4, 5, 6 },
+ { LEVEL_5, 311951360, 8912896, 60000, 36000, 6, 8, 6, 4 },
+ { LEVEL_5_1, 588251136, 8912896, 120000, 46000, 8, 8, 10, 4 },
// TODO(huisu): update max_cpb_size for level 5_2 ~ 6_2 when
- // they are finalized (currently TBD).
- { LEVEL_5_2, 1176502272, 8912896, 180000, 0, 8, 8, 10, 4 },
- { LEVEL_6, 1176502272, 35651584, 180000, 0, 8, 16, 10, 4 },
- { LEVEL_6_1, 2353004544u, 35651584, 240000, 0, 8, 16, 10, 4 },
- { LEVEL_6_2, 4706009088u, 35651584, 480000, 0, 8, 16, 10, 4 },
+ // they are finalized (currently tentative).
+ { LEVEL_5_2, 1176502272, 8912896, 180000, 90000, 8, 8, 10, 4 },
+ { LEVEL_6, 1176502272, 35651584, 180000, 90000, 8, 16, 10, 4 },
+ { LEVEL_6_1, 2353004544u, 35651584, 240000, 180000, 8, 16, 10, 4 },
+ { LEVEL_6_2, 4706009088u, 35651584, 480000, 360000, 8, 16, 10, 4 },
};
/* clang-format on */
@@ -2390,7 +2718,9 @@ static void scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
#if CONFIG_VP9_HIGHBITDEPTH
static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src,
- YV12_BUFFER_CONFIG *dst, int bd) {
+ YV12_BUFFER_CONFIG *dst, int bd,
+ INTERP_FILTER filter_type,
+ int phase_scaler) {
const int src_w = src->y_crop_width;
const int src_h = src->y_crop_height;
const int dst_w = dst->y_crop_width;
@@ -2400,7 +2730,7 @@ static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src,
const int src_strides[3] = { src->y_stride, src->uv_stride, src->uv_stride };
uint8_t *const dsts[3] = { dst->y_buffer, dst->u_buffer, dst->v_buffer };
const int dst_strides[3] = { dst->y_stride, dst->uv_stride, dst->uv_stride };
- const InterpKernel *const kernel = vp9_filter_kernels[EIGHTTAP];
+ const InterpKernel *const kernel = vp9_filter_kernels[filter_type];
int x, y, i;
for (i = 0; i < MAX_MB_PLANE; ++i) {
@@ -2408,16 +2738,17 @@ static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src,
const int src_stride = src_strides[i];
const int dst_stride = dst_strides[i];
for (y = 0; y < dst_h; y += 16) {
- const int y_q4 = y * (16 / factor) * src_h / dst_h;
+ const int y_q4 = y * (16 / factor) * src_h / dst_h + phase_scaler;
for (x = 0; x < dst_w; x += 16) {
- const int x_q4 = x * (16 / factor) * src_w / dst_w;
+ const int x_q4 = x * (16 / factor) * src_w / dst_w + phase_scaler;
const uint8_t *src_ptr = srcs[i] +
(y / factor) * src_h / dst_h * src_stride +
(x / factor) * src_w / dst_w;
uint8_t *dst_ptr = dsts[i] + (y / factor) * dst_stride + (x / factor);
if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
- vpx_highbd_convolve8(src_ptr, src_stride, dst_ptr, dst_stride,
+ vpx_highbd_convolve8(CONVERT_TO_SHORTPTR(src_ptr), src_stride,
+ CONVERT_TO_SHORTPTR(dst_ptr), dst_stride,
kernel[x_q4 & 0xf], 16 * src_w / dst_w,
kernel[y_q4 & 0xf], 16 * src_h / dst_h,
16 / factor, 16 / factor, bd);
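A note on phase_scaler, inferred from the q4 arithmetic above rather than stated anywhere in the patch: x_q4 and y_q4 count sixteenths of a source pixel, so the added offset shifts the whole sampling grid. A minimal sketch, assuming factor == 1:

/* Sketch only: the subpel position picks the interpolation kernel row. */
const int y_q4 = y * 16 * src_h / dst_h + phase_scaler; /* 1/16-pel units */
const int subpel_y = y_q4 & 0xf; /* phase_scaler == 8 -> half-pel shift */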
@@ -2618,6 +2949,10 @@ static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) {
MACROBLOCKD *xd = &cpi->td.mb.e_mbd;
struct loopfilter *lf = &cm->lf;
+ const int is_reference_frame =
+ (cm->frame_type == KEY_FRAME || cpi->refresh_last_frame ||
+ cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame);
+
if (xd->lossless) {
lf->filter_level = 0;
lf->last_filt_level = 0;
@@ -2643,7 +2978,7 @@ static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) {
cpi->time_pick_lpf += vpx_usec_timer_elapsed(&timer);
}
- if (lf->filter_level > 0) {
+ if (lf->filter_level > 0 && is_reference_frame) {
vp9_build_mask_frame(cm, lf->filter_level, 0);
if (cpi->num_workers > 1)
@@ -2708,7 +3043,8 @@ void vp9_scale_references(VP9_COMP *cpi) {
cm->byte_alignment, NULL, NULL, NULL))
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to allocate frame buffer");
- scale_and_extend_frame(ref, &new_fb_ptr->buf, (int)cm->bit_depth);
+ scale_and_extend_frame(ref, &new_fb_ptr->buf, (int)cm->bit_depth,
+ EIGHTTAP, 0);
cpi->scaled_ref_idx[ref_frame - 1] = new_fb;
alloc_frame_mvs(cm, new_fb);
}
@@ -2731,7 +3067,7 @@ void vp9_scale_references(VP9_COMP *cpi) {
cm->byte_alignment, NULL, NULL, NULL))
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to allocate frame buffer");
- vp9_scale_and_extend_frame(ref, &new_fb_ptr->buf);
+ vp9_scale_and_extend_frame(ref, &new_fb_ptr->buf, EIGHTTAP, 0);
cpi->scaled_ref_idx[ref_frame - 1] = new_fb;
alloc_frame_mvs(cm, new_fb);
}
@@ -3118,6 +3454,15 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
uint8_t *dest) {
VP9_COMMON *const cm = &cpi->common;
int q = 0, bottom_index = 0, top_index = 0; // Dummy variables.
+ const INTERP_FILTER filter_scaler =
+ (is_one_pass_cbr_svc(cpi))
+ ? cpi->svc.downsample_filter_type[cpi->svc.spatial_layer_id]
+ : EIGHTTAP;
+ const int phase_scaler =
+ (is_one_pass_cbr_svc(cpi))
+ ? cpi->svc.downsample_filter_phase[cpi->svc.spatial_layer_id]
+ : 0;
+
  // Flag to check if it's valid to compute the source sad (used for
// scene detection and for superblock content state in CBR mode).
// The flag may get reset below based on SVC or resizing state.
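A hedged usage sketch (the per-layer arrays appear above, but the values and the idea that an application sets them this way are assumptions): a two-spatial-layer CBR SVC encode could request a cheaper, phase-shifted downsampling filter for a given layer, which the lookups above then pick up:

/* Illustrative values only. */
cpi->svc.downsample_filter_type[1] = BILINEAR; /* 2-tap filter, cheaper */
cpi->svc.downsample_filter_phase[1] = 8;       /* half-pel phase shift  */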
@@ -3136,8 +3481,11 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
// For svc, if it is a 1/4x1/4 downscaling, do a two-stage scaling to take
// advantage of the 1:2 optimized scaler. In the process, the 1/2x1/2
// result will be saved in scaled_temp and might be used later.
+ const INTERP_FILTER filter_scaler2 = cpi->svc.downsample_filter_type[1];
+ const int phase_scaler2 = cpi->svc.downsample_filter_phase[1];
cpi->Source = vp9_svc_twostage_scale(
- cm, cpi->un_scaled_source, &cpi->scaled_source, &cpi->svc.scaled_temp);
+ cm, cpi->un_scaled_source, &cpi->scaled_source, &cpi->svc.scaled_temp,
+ filter_scaler, phase_scaler, filter_scaler2, phase_scaler2);
cpi->svc.scaled_one_half = 1;
} else if (is_one_pass_cbr_svc(cpi) &&
cpi->un_scaled_source->y_width == cm->width << 1 &&
@@ -3149,16 +3497,17 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
cpi->svc.scaled_one_half = 0;
} else {
cpi->Source = vp9_scale_if_required(
- cm, cpi->un_scaled_source, &cpi->scaled_source, (cpi->oxcf.pass == 0));
+ cm, cpi->un_scaled_source, &cpi->scaled_source, (cpi->oxcf.pass == 0),
+ filter_scaler, phase_scaler);
}
// Unfiltered raw source used in metrics calculation if the source
// has been filtered.
if (is_psnr_calc_enabled(cpi)) {
#ifdef ENABLE_KF_DENOISE
if (is_spatial_denoise_enabled(cpi)) {
- cpi->raw_source_frame =
- vp9_scale_if_required(cm, &cpi->raw_unscaled_source,
- &cpi->raw_scaled_source, (cpi->oxcf.pass == 0));
+ cpi->raw_source_frame = vp9_scale_if_required(
+ cm, &cpi->raw_unscaled_source, &cpi->raw_scaled_source,
+ (cpi->oxcf.pass == 0), EIGHTTAP, phase_scaler);
} else {
cpi->raw_source_frame = cpi->Source;
}
@@ -3190,9 +3539,9 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
cpi->sf.partition_search_type == SOURCE_VAR_BASED_PARTITION ||
(cpi->noise_estimate.enabled && !cpi->oxcf.noise_sensitivity) ||
cpi->compute_source_sad_onepass))
- cpi->Last_Source =
- vp9_scale_if_required(cm, cpi->unscaled_last_source,
- &cpi->scaled_last_source, (cpi->oxcf.pass == 0));
+ cpi->Last_Source = vp9_scale_if_required(
+ cm, cpi->unscaled_last_source, &cpi->scaled_last_source,
+ (cpi->oxcf.pass == 0), EIGHTTAP, 0);
if (cpi->Last_Source == NULL ||
cpi->Last_Source->y_width != cpi->Source->y_width ||
@@ -3214,10 +3563,11 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
cpi->oxcf.content == VP9E_CONTENT_SCREEN))
vp9_scene_detection_onepass(cpi);
- // For 1 pass SVC, since only ZEROMV is allowed for upsampled reference
- // frame (i.e, svc->force_zero_mode_spatial_ref = 0), we can avoid this
- // frame-level upsampling.
- if (frame_is_intra_only(cm) == 0 && !is_one_pass_cbr_svc(cpi)) {
+  // For 1-pass CBR SVC, only ZEROMV is allowed for the spatial reference
+  // frame when svc->force_zero_mode_spatial_ref = 1. Under those conditions
+  // we can avoid this frame-level upsampling (for non-intra_only frames).
+ if (frame_is_intra_only(cm) == 0 &&
+ !(is_one_pass_cbr_svc(cpi) && cpi->svc.force_zero_mode_spatial_ref)) {
vp9_scale_references(cpi);
}
@@ -3374,8 +3724,9 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size,
&frame_over_shoot_limit);
}
- cpi->Source = vp9_scale_if_required(
- cm, cpi->un_scaled_source, &cpi->scaled_source, (cpi->oxcf.pass == 0));
+ cpi->Source =
+ vp9_scale_if_required(cm, cpi->un_scaled_source, &cpi->scaled_source,
+ (cpi->oxcf.pass == 0), EIGHTTAP, 0);
// Unfiltered raw source used in metrics calculation if the source
// has been filtered.
@@ -3384,7 +3735,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size,
if (is_spatial_denoise_enabled(cpi)) {
cpi->raw_source_frame = vp9_scale_if_required(
cm, &cpi->raw_unscaled_source, &cpi->raw_scaled_source,
- (cpi->oxcf.pass == 0));
+ (cpi->oxcf.pass == 0), EIGHTTAP, 0);
} else {
cpi->raw_source_frame = cpi->Source;
}
@@ -3394,9 +3745,9 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size,
}
if (cpi->unscaled_last_source != NULL)
- cpi->Last_Source = vp9_scale_if_required(cm, cpi->unscaled_last_source,
- &cpi->scaled_last_source,
- (cpi->oxcf.pass == 0));
+ cpi->Last_Source = vp9_scale_if_required(
+ cm, cpi->unscaled_last_source, &cpi->scaled_last_source,
+ (cpi->oxcf.pass == 0), EIGHTTAP, 0);
if (frame_is_intra_only(cm) == 0) {
if (loop_count > 0) {
@@ -3625,6 +3976,15 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size,
#endif
if (enable_acl) {
+    // Skip recoding if the context model diff is below the threshold.
+ const int thresh = compute_context_model_thresh(cpi);
+ const int diff = compute_context_model_diff(cm);
+ if (diff < thresh) {
+ vpx_clear_system_state();
+ restore_coding_context(cpi);
+ return;
+ }
+
vp9_encode_frame(cpi);
vpx_clear_system_state();
restore_coding_context(cpi);
@@ -3674,23 +4034,28 @@ static void set_ext_overrides(VP9_COMP *cpi) {
}
}
-YV12_BUFFER_CONFIG *vp9_svc_twostage_scale(VP9_COMMON *cm,
- YV12_BUFFER_CONFIG *unscaled,
- YV12_BUFFER_CONFIG *scaled,
- YV12_BUFFER_CONFIG *scaled_temp) {
+YV12_BUFFER_CONFIG *vp9_svc_twostage_scale(
+ VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled,
+ YV12_BUFFER_CONFIG *scaled_temp, INTERP_FILTER filter_type,
+ int phase_scaler, INTERP_FILTER filter_type2, int phase_scaler2) {
if (cm->mi_cols * MI_SIZE != unscaled->y_width ||
cm->mi_rows * MI_SIZE != unscaled->y_height) {
#if CONFIG_VP9_HIGHBITDEPTH
if (cm->bit_depth == VPX_BITS_8) {
- vp9_scale_and_extend_frame(unscaled, scaled_temp);
- vp9_scale_and_extend_frame(scaled_temp, scaled);
+ vp9_scale_and_extend_frame(unscaled, scaled_temp, filter_type2,
+ phase_scaler2);
+ vp9_scale_and_extend_frame(scaled_temp, scaled, filter_type,
+ phase_scaler);
} else {
- scale_and_extend_frame(unscaled, scaled_temp, (int)cm->bit_depth);
- scale_and_extend_frame(scaled_temp, scaled, (int)cm->bit_depth);
+ scale_and_extend_frame(unscaled, scaled_temp, (int)cm->bit_depth,
+ filter_type2, phase_scaler2);
+ scale_and_extend_frame(scaled_temp, scaled, (int)cm->bit_depth,
+ filter_type, phase_scaler);
}
#else
- vp9_scale_and_extend_frame(unscaled, scaled_temp);
- vp9_scale_and_extend_frame(scaled_temp, scaled);
+ vp9_scale_and_extend_frame(unscaled, scaled_temp, filter_type2,
+ phase_scaler2);
+ vp9_scale_and_extend_frame(scaled_temp, scaled, filter_type, phase_scaler);
#endif // CONFIG_VP9_HIGHBITDEPTH
return scaled;
} else {
@@ -3698,25 +4063,25 @@ YV12_BUFFER_CONFIG *vp9_svc_twostage_scale(VP9_COMMON *cm,
}
}
-YV12_BUFFER_CONFIG *vp9_scale_if_required(VP9_COMMON *cm,
- YV12_BUFFER_CONFIG *unscaled,
- YV12_BUFFER_CONFIG *scaled,
- int use_normative_scaler) {
+YV12_BUFFER_CONFIG *vp9_scale_if_required(
+ VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled,
+ int use_normative_scaler, INTERP_FILTER filter_type, int phase_scaler) {
if (cm->mi_cols * MI_SIZE != unscaled->y_width ||
cm->mi_rows * MI_SIZE != unscaled->y_height) {
#if CONFIG_VP9_HIGHBITDEPTH
if (use_normative_scaler && unscaled->y_width <= (scaled->y_width << 1) &&
unscaled->y_height <= (scaled->y_height << 1))
if (cm->bit_depth == VPX_BITS_8)
- vp9_scale_and_extend_frame(unscaled, scaled);
+ vp9_scale_and_extend_frame(unscaled, scaled, filter_type, phase_scaler);
else
- scale_and_extend_frame(unscaled, scaled, (int)cm->bit_depth);
+ scale_and_extend_frame(unscaled, scaled, (int)cm->bit_depth,
+ filter_type, phase_scaler);
else
scale_and_extend_frame_nonnormative(unscaled, scaled, (int)cm->bit_depth);
#else
if (use_normative_scaler && unscaled->y_width <= (scaled->y_width << 1) &&
unscaled->y_height <= (scaled->y_height << 1))
- vp9_scale_and_extend_frame(unscaled, scaled);
+ vp9_scale_and_extend_frame(unscaled, scaled, filter_type, phase_scaler);
else
scale_and_extend_frame_nonnormative(unscaled, scaled);
#endif // CONFIG_VP9_HIGHBITDEPTH
@@ -4049,12 +4414,14 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size,
++cm->current_video_frame;
cpi->ext_refresh_frame_flags_pending = 0;
cpi->svc.rc_drop_superframe = 1;
+ cpi->last_frame_dropped = 1;
// TODO(marpan): Advancing the svc counters on dropped frames can break
// the referencing scheme for the fixed svc patterns defined in
// vp9_one_pass_cbr_svc_start_layer(). Look into fixing this issue, but
// for now, don't advance the svc frame counters on dropped frame.
// if (cpi->use_svc)
// vp9_inc_frame_in_layer(cpi);
+
return;
}
}
@@ -4072,6 +4439,8 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size,
encode_with_recode_loop(cpi, size, dest);
}
+ cpi->last_frame_dropped = 0;
+
  // Disable segmentation if it decreases the rate/distortion ratio
if (cpi->oxcf.aq_mode == LOOKAHEAD_AQ)
vp9_try_disable_lookahead_aq(cpi, size, dest);
@@ -5261,4 +5630,9 @@ void vp9_set_row_mt(VP9_COMP *cpi) {
if (cpi->oxcf.mode == REALTIME && cpi->oxcf.speed >= 5 && cpi->oxcf.row_mt) {
cpi->row_mt = 1;
}
+
+ if (cpi->row_mt && cpi->oxcf.max_threads > 1)
+ cpi->row_mt_bit_exact = 1;
+ else
+ cpi->row_mt_bit_exact = 0;
}
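The new flag is derived rather than user-configured; the branch above is equivalent to the one-liner below (a stylistic restatement, nothing more):

cpi->row_mt_bit_exact = (cpi->row_mt && cpi->oxcf.max_threads > 1) ? 1 : 0;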
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.h
index 6c1cb6073e8..672c83bfdf9 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.h
@@ -137,6 +137,7 @@ typedef enum {
kLowSadHighSumdiff = 2,
kHighSadLowSumdiff = 3,
kHighSadHighSumdiff = 4,
+ kLowVarHighSumdiff = 5,
} CONTENT_STATE_SB;
typedef struct VP9EncoderConfig {
@@ -268,7 +269,6 @@ typedef struct VP9EncoderConfig {
VP9E_TEMPORAL_LAYERING_MODE temporal_layering_mode;
int row_mt;
- unsigned int row_mt_bit_exact;
unsigned int motion_vector_unit_test;
} VP9EncoderConfig;
@@ -281,17 +281,11 @@ typedef struct TileDataEnc {
TileInfo tile_info;
int thresh_freq_fact[BLOCK_SIZES][MAX_MODES];
int mode_map[BLOCK_SIZES][MAX_MODES];
- int m_search_count;
- int ex_search_count;
FIRSTPASS_DATA fp_data;
VP9RowMTSync row_mt_sync;
// Used for adaptive_rd_thresh with row multithreading
int *row_base_thresh_freq_fact;
-#if CONFIG_MULTITHREAD
- pthread_mutex_t *search_count_mutex;
- pthread_mutex_t *enc_row_mt_mutex;
-#endif
} TileDataEnc;
typedef struct RowMTInfo {
@@ -695,7 +689,9 @@ typedef struct VP9_COMP {
void (*row_mt_sync_read_ptr)(VP9RowMTSync *const, int, int);
void (*row_mt_sync_write_ptr)(VP9RowMTSync *const, int, int, const int);
ARNRFilterData arnr_filter_data;
+
int row_mt;
+ unsigned int row_mt_bit_exact;
// Previous Partition Info
BLOCK_SIZE *prev_partition;
@@ -708,6 +704,8 @@ typedef struct VP9_COMP {
uint8_t *prev_variance_low;
uint8_t *copied_frame_cnt;
uint8_t max_copied_frame;
+  // If the last frame was dropped, we don't copy the partition.
+ uint8_t last_frame_dropped;
  // For each superblock: keeps track of the last time (in frame distance)
  // the superblock did not have low source sad.
@@ -840,15 +838,14 @@ void vp9_update_reference_frames(VP9_COMP *cpi);
void vp9_set_high_precision_mv(VP9_COMP *cpi, int allow_high_precision_mv);
-YV12_BUFFER_CONFIG *vp9_svc_twostage_scale(VP9_COMMON *cm,
- YV12_BUFFER_CONFIG *unscaled,
- YV12_BUFFER_CONFIG *scaled,
- YV12_BUFFER_CONFIG *scaled_temp);
+YV12_BUFFER_CONFIG *vp9_svc_twostage_scale(
+ VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled,
+ YV12_BUFFER_CONFIG *scaled_temp, INTERP_FILTER filter_type,
+ int phase_scaler, INTERP_FILTER filter_type2, int phase_scaler2);
-YV12_BUFFER_CONFIG *vp9_scale_if_required(VP9_COMMON *cm,
- YV12_BUFFER_CONFIG *unscaled,
- YV12_BUFFER_CONFIG *scaled,
- int use_normative_scaler);
+YV12_BUFFER_CONFIG *vp9_scale_if_required(
+ VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled,
+ int use_normative_scaler, INTERP_FILTER filter_type, int phase_scaler);
void vp9_apply_encoding_flags(VP9_COMP *cpi, vpx_enc_frame_flags_t flags);
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ethread.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ethread.c
index 681e960c8df..51664112a44 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ethread.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ethread.c
@@ -552,7 +552,6 @@ static int enc_row_mt_worker_hook(EncWorkerData *const thread_data,
const VP9_COMMON *const cm = &cpi->common;
const int tile_cols = 1 << cm->log2_tile_cols;
int tile_row, tile_col;
- TileDataEnc *this_tile;
int end_of_frame;
int thread_id = thread_data->thread_id;
int cur_tile_id = multi_thread_ctxt->thread_id_to_tile_id[thread_id];
@@ -574,13 +573,6 @@ static int enc_row_mt_worker_hook(EncWorkerData *const thread_data,
tile_row = proc_job->tile_row_id;
mi_row = proc_job->vert_unit_row_num * MI_BLOCK_SIZE;
- this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col];
- thread_data->td->mb.m_search_count_ptr = &this_tile->m_search_count;
- thread_data->td->mb.ex_search_count_ptr = &this_tile->ex_search_count;
-#if CONFIG_MULTITHREAD
- thread_data->td->mb.search_count_mutex = this_tile->search_count_mutex;
-#endif
-
vp9_encode_sb_row(cpi, thread_data->td, tile_row, tile_col, mi_row);
}
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.c
index 222e27a9f26..b6e3275482c 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.c
@@ -42,15 +42,12 @@
#define OUTPUT_FPF 0
#define ARF_STATS_OUTPUT 0
-#define FACTOR_PT_LOW 0.70
-#define FACTOR_PT_HIGH 0.90
#define FIRST_PASS_Q 10.0
#define GF_MAX_BOOST 96.0
#define INTRA_MODE_PENALTY 1024
#define MIN_ARF_GF_BOOST 240
#define MIN_DECAY_FACTOR 0.01
#define NEW_MV_MODE_PENALTY 32
-#define SVC_FACTOR_PT_LOW 0.45
#define DARK_THRESH 64
#define DEFAULT_GRP_WEIGHT 1.0
#define RC_FACTOR_MIN 0.75
@@ -241,14 +238,14 @@ static double calculate_active_area(const VP9_COMP *cpi,
// Calculate a modified Error used in distributing bits between easier and
// harder frames.
#define ACT_AREA_CORRECTION 0.5
-static double calculate_modified_err(const VP9_COMP *cpi,
- const TWO_PASS *twopass,
- const VP9EncoderConfig *oxcf,
- const FIRSTPASS_STATS *this_frame) {
+static double calculate_mod_frame_score(const VP9_COMP *cpi,
+ const TWO_PASS *twopass,
+ const VP9EncoderConfig *oxcf,
+ const FIRSTPASS_STATS *this_frame) {
const FIRSTPASS_STATS *const stats = &twopass->total_stats;
const double av_weight = stats->weight / stats->count;
const double av_err = (stats->coded_error * av_weight) / stats->count;
- double modified_error =
+ double modified_score =
av_err * pow(this_frame->coded_error * this_frame->weight /
DOUBLE_DIVIDE_CHECK(av_err),
oxcf->two_pass_vbrbias / 100.0);
@@ -258,11 +255,38 @@ static double calculate_modified_err(const VP9_COMP *cpi,
// remaining active MBs. The correction here assumes that coding
// 0.5N blocks of complexity 2X is a little easier than coding N
// blocks of complexity X.
- modified_error *=
+ modified_score *=
pow(calculate_active_area(cpi, this_frame), ACT_AREA_CORRECTION);
- return fclamp(modified_error, twopass->modified_error_min,
- twopass->modified_error_max);
+ return modified_score;
+}
+static double calculate_norm_frame_score(const VP9_COMP *cpi,
+ const TWO_PASS *twopass,
+ const VP9EncoderConfig *oxcf,
+ const FIRSTPASS_STATS *this_frame) {
+ const FIRSTPASS_STATS *const stats = &twopass->total_stats;
+ const double av_weight = stats->weight / stats->count;
+ const double av_err = (stats->coded_error * av_weight) / stats->count;
+ double modified_score =
+ av_err * pow(this_frame->coded_error * this_frame->weight /
+ DOUBLE_DIVIDE_CHECK(av_err),
+ oxcf->two_pass_vbrbias / 100.0);
+
+ const double min_score = (double)(oxcf->two_pass_vbrmin_section) / 100.0;
+ const double max_score = (double)(oxcf->two_pass_vbrmax_section) / 100.0;
+
+ // Correction for active area. Frames with a reduced active area
+ // (eg due to formatting bars) have a higher error per mb for the
+ // remaining active MBs. The correction here assumes that coding
+ // 0.5N blocks of complexity 2X is a little easier than coding N
+ // blocks of complexity X.
+ modified_score *=
+ pow(calculate_active_area(cpi, this_frame), ACT_AREA_CORRECTION);
+
+ // Normalize to a midpoint score.
+ modified_score /= DOUBLE_DIVIDE_CHECK(twopass->mean_mod_score);
+
+ return fclamp(modified_score, min_score, max_score);
}
// This function returns the maximum target rate per frame.
@@ -710,9 +734,14 @@ static void first_pass_stat_calc(VP9_COMP *cpi, FIRSTPASS_STATS *fps,
fps->frame = cm->current_video_frame;
fps->spatial_layer_id = cpi->svc.spatial_layer_id;
- fps->coded_error = (double)(fp_acc_data->coded_error >> 8) + min_err;
- fps->sr_coded_error = (double)(fp_acc_data->sr_coded_error >> 8) + min_err;
- fps->intra_error = (double)(fp_acc_data->intra_error >> 8) + min_err;
+
+ fps->coded_error =
+ ((double)(fp_acc_data->coded_error >> 8) + min_err) / num_mbs;
+ fps->sr_coded_error =
+ ((double)(fp_acc_data->sr_coded_error >> 8) + min_err) / num_mbs;
+ fps->intra_error =
+ ((double)(fp_acc_data->intra_error >> 8) + min_err) / num_mbs;
+
fps->frame_noise_energy =
(double)(fp_acc_data->frame_noise_energy) / (double)num_mbs;
fps->count = 1.0;
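This per-macroblock normalization is the thread running through several hunks below: coded_error, sr_coded_error, and intra_error are now averaged over num_mbs at collection time, so downstream consumers (get_sr_decay_rate, calc_frame_boost, calc_kf_frame_boost, vp9_rc_get_second_pass_params) drop their own divisions. A before/after sketch of the consumer side (illustrative):

/* Before: stats carried frame totals.   After: stats carry per-MB values. */
double err_per_mb_old = frame->coded_error / num_mbs; /* old consumer code */
double err_per_mb_new = frame->coded_error;           /* new consumer code */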
@@ -979,12 +1008,12 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td,
if (log_intra < 10.0) {
mb_intra_factor = 1.0 + ((10.0 - log_intra) * 0.05);
fp_acc_data->intra_factor += mb_intra_factor;
- if (cpi->oxcf.row_mt_bit_exact)
+ if (cpi->row_mt_bit_exact)
cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_intra_factor =
mb_intra_factor;
} else {
fp_acc_data->intra_factor += 1.0;
- if (cpi->oxcf.row_mt_bit_exact)
+ if (cpi->row_mt_bit_exact)
cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_intra_factor = 1.0;
}
@@ -999,12 +1028,12 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td,
if ((level_sample < DARK_THRESH) && (log_intra < 9.0)) {
mb_brightness_factor = 1.0 + (0.01 * (DARK_THRESH - level_sample));
fp_acc_data->brightness_factor += mb_brightness_factor;
- if (cpi->oxcf.row_mt_bit_exact)
+ if (cpi->row_mt_bit_exact)
cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_brightness_factor =
mb_brightness_factor;
} else {
fp_acc_data->brightness_factor += 1.0;
- if (cpi->oxcf.row_mt_bit_exact)
+ if (cpi->row_mt_bit_exact)
cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_brightness_factor =
1.0;
}
@@ -1166,7 +1195,7 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td,
if (((this_error - intrapenalty) * 9 <= motion_error * 10) &&
(this_error < (2 * intrapenalty))) {
fp_acc_data->neutral_count += 1.0;
- if (cpi->oxcf.row_mt_bit_exact)
+ if (cpi->row_mt_bit_exact)
cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_neutral_count =
1.0;
// Also track cases where the intra is not much worse than the inter
@@ -1176,7 +1205,7 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td,
mb_neutral_count =
(double)motion_error / DOUBLE_DIVIDE_CHECK((double)this_error);
fp_acc_data->neutral_count += mb_neutral_count;
- if (cpi->oxcf.row_mt_bit_exact)
+ if (cpi->row_mt_bit_exact)
cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_neutral_count =
mb_neutral_count;
}
@@ -1400,7 +1429,7 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
(cpi->ref_frame_flags & VP9_GOLD_FLAG) ? GOLDEN_FRAME : NONE);
cpi->Source = vp9_scale_if_required(cm, cpi->un_scaled_source,
- &cpi->scaled_source, 0);
+ &cpi->scaled_source, 0, EIGHTTAP, 0);
}
vp9_setup_block_planes(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
@@ -1424,7 +1453,7 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
cm->log2_tile_rows = 0;
- if (cpi->oxcf.row_mt_bit_exact && cpi->twopass.fp_mb_float_stats == NULL)
+ if (cpi->row_mt_bit_exact && cpi->twopass.fp_mb_float_stats == NULL)
CHECK_MEM_ERROR(
cm, cpi->twopass.fp_mb_float_stats,
vpx_calloc(cm->MBs * sizeof(*cpi->twopass.fp_mb_float_stats), 1));
@@ -1441,13 +1470,13 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
} else {
cpi->row_mt_sync_read_ptr = vp9_row_mt_sync_read;
cpi->row_mt_sync_write_ptr = vp9_row_mt_sync_write;
- if (cpi->oxcf.row_mt_bit_exact) {
+ if (cpi->row_mt_bit_exact) {
cm->log2_tile_cols = 0;
vp9_zero_array(cpi->twopass.fp_mb_float_stats, cm->MBs);
}
vp9_encode_fp_row_mt(cpi);
first_tile_col = &cpi->tile_data[0];
- if (cpi->oxcf.row_mt_bit_exact)
+ if (cpi->row_mt_bit_exact)
accumulate_floating_point_stats(cpi, first_tile_col);
first_pass_stat_calc(cpi, &fps, &(first_tile_col->fp_data));
}
@@ -1522,14 +1551,22 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
if (cpi->use_svc) vp9_inc_frame_in_layer(cpi);
}
+static const double q_pow_term[(QINDEX_RANGE >> 5) + 1] = {
+ 0.65, 0.70, 0.75, 0.85, 0.90, 0.90, 0.90, 1.00, 1.25
+};
+
static double calc_correction_factor(double err_per_mb, double err_divisor,
- double pt_low, double pt_high, int q,
- vpx_bit_depth_t bit_depth) {
- const double error_term = err_per_mb / err_divisor;
+ int q) {
+ const double error_term = err_per_mb / DOUBLE_DIVIDE_CHECK(err_divisor);
+ const int index = q >> 5;
+ double power_term;
+
+ assert((index >= 0) && (index < (QINDEX_RANGE >> 5)));
- // Adjustment based on actual quantizer to power term.
- const double power_term =
- VPXMIN(vp9_convert_qindex_to_q(q, bit_depth) * 0.01 + pt_low, pt_high);
+ // Adjustment based on quantizer to the power term.
+ power_term =
+ q_pow_term[index] +
+ (((q_pow_term[index + 1] - q_pow_term[index]) * (q % 32)) / 32.0);
// Calculate correction factor.
if (power_term < 1.0) assert(error_term >= 0.0);
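For a concrete feel of the interpolated power term (worked values, assumed q = 100): index = 100 >> 5 = 3, and the fractional position inside the 32-wide bucket is (100 % 32) / 32, giving:

/* q_pow_term[3] = 0.85, q_pow_term[4] = 0.90 */
double power_term = 0.85 + (0.90 - 0.85) * (100 % 32) / 32.0; /* 0.85625 */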
@@ -1560,17 +1597,14 @@ static int get_twopass_worst_quality(VP9_COMP *cpi, const double section_err,
const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
? cpi->initial_mbs
: cpi->common.MBs;
- const int active_mbs = VPXMAX(1, num_mbs - (int)(num_mbs * inactive_zone));
- const double av_err_per_mb = section_err / active_mbs;
+ const double active_pct = VPXMAX(0.01, 1.0 - inactive_zone);
+ const int active_mbs = (int)VPXMAX(1, (double)num_mbs * active_pct);
+ const double av_err_per_mb = section_err / active_pct;
const double speed_term = 1.0 + 0.04 * oxcf->speed;
double last_group_rate_err;
const int target_norm_bits_per_mb =
(int)(((uint64_t)target_rate << BPER_MB_NORMBITS) / active_mbs);
int q;
- int is_svc_upper_layer = 0;
-
- if (is_two_pass_svc(cpi) && cpi->svc.spatial_layer_id > 0)
- is_svc_upper_layer = 1;
  // Based on recent history, adjust expectations of bits per macroblock.
last_group_rate_err =
@@ -1583,10 +1617,8 @@ static int get_twopass_worst_quality(VP9_COMP *cpi, const double section_err,
// Try and pick a max Q that will be high enough to encode the
// content at the given rate.
for (q = rc->best_quality; q < rc->worst_quality; ++q) {
- const double factor = calc_correction_factor(
- av_err_per_mb, ERR_DIVISOR,
- is_svc_upper_layer ? SVC_FACTOR_PT_LOW : FACTOR_PT_LOW,
- FACTOR_PT_HIGH, q, cpi->common.bit_depth);
+ const double factor =
+ calc_correction_factor(av_err_per_mb, ERR_DIVISOR, q);
const int bits_per_mb = vp9_rc_bits_per_mb(
INTER_FRAME, q,
factor * speed_term * cpi->twopass.bpm_factor * noise_factor,
@@ -1676,22 +1708,35 @@ void vp9_init_second_pass(VP9_COMP *cpi) {
// This variable monitors how far behind the second ref update is lagging.
twopass->sr_update_lag = 1;
- // Scan the first pass file and calculate a modified total error based upon
- // the bias/power function used to allocate bits.
+ // Scan the first pass file and calculate a modified score for each
+ // frame that is used to distribute bits. The modified score is assumed
+  // to provide a linear basis for bit allocation, i.e. a frame A with a
+  // score double that of frame B will be allocated twice as many bits.
{
- const double avg_error =
- stats->coded_error / DOUBLE_DIVIDE_CHECK(stats->count);
const FIRSTPASS_STATS *s = twopass->stats_in;
- double modified_error_total = 0.0;
- twopass->modified_error_min =
- (avg_error * oxcf->two_pass_vbrmin_section) / 100;
- twopass->modified_error_max =
- (avg_error * oxcf->two_pass_vbrmax_section) / 100;
+ double modified_score_total = 0.0;
+
+ // The first scan is unclamped and gives a raw average.
+ while (s < twopass->stats_in_end) {
+ modified_score_total += calculate_mod_frame_score(cpi, twopass, oxcf, s);
+ ++s;
+ }
+
+ // The average error from this first scan is used to define the midpoint
+ // error for the rate distribution function.
+ twopass->mean_mod_score =
+ modified_score_total / DOUBLE_DIVIDE_CHECK(stats->count);
+
+ // Second scan using clamps based on the previous cycle average.
+  // This may modify the total and average somewhat, but we don't bother
+  // with further iterations.
+ s = twopass->stats_in;
+ modified_score_total = 0.0;
while (s < twopass->stats_in_end) {
- modified_error_total += calculate_modified_err(cpi, twopass, oxcf, s);
+ modified_score_total += calculate_norm_frame_score(cpi, twopass, oxcf, s);
++s;
}
- twopass->modified_error_left = modified_error_total;
+ twopass->normalized_score_left = modified_score_total;
}
// Reset the vbr bits off target counters
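Written out as the normalization the two scans implement (semantics inferred from the code above):

/* Pass 1 (unclamped): mean_mod_score = sum_i raw_score_i / frame_count
 * Pass 2 (clamped):   norm_score_i = clamp(raw_score_i / mean_mod_score,
 *                                          two_pass_vbrmin_section / 100.0,
 *                                          two_pass_vbrmax_section / 100.0)
 * normalized_score_left = sum_i norm_score_i
 */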
@@ -1728,9 +1773,7 @@ void vp9_init_second_pass(VP9_COMP *cpi) {
static double get_sr_decay_rate(const VP9_COMP *cpi,
const FIRSTPASS_STATS *frame) {
- const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs
- : cpi->common.MBs;
- double sr_diff = (frame->sr_coded_error - frame->coded_error) / num_mbs;
+ double sr_diff = (frame->sr_coded_error - frame->coded_error);
double sr_decay = 1.0;
double modified_pct_inter;
double modified_pcnt_intra;
@@ -1739,7 +1782,7 @@ static double get_sr_decay_rate(const VP9_COMP *cpi,
(cpi->initial_height + cpi->initial_width));
modified_pct_inter = frame->pcnt_inter;
- if (((frame->coded_error / num_mbs) > LOW_CODED_ERR_PER_MB) &&
+ if ((frame->coded_error > LOW_CODED_ERR_PER_MB) &&
((frame->intra_error / DOUBLE_DIVIDE_CHECK(frame->coded_error)) <
(double)NCOUNT_FRAME_II_THRESH)) {
modified_pct_inter =
@@ -1861,20 +1904,16 @@ static double calc_frame_boost(VP9_COMP *cpi, const FIRSTPASS_STATS *this_frame,
const double lq = vp9_convert_qindex_to_q(
cpi->rc.avg_frame_qindex[INTER_FRAME], cpi->common.bit_depth);
const double boost_q_correction = VPXMIN((0.5 + (lq * 0.015)), 1.5);
- int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs
- : cpi->common.MBs;
-
- // Correct for any inactive region in the image
- num_mbs = (int)VPXMAX(1, num_mbs * calculate_active_area(cpi, this_frame));
+ const double active_area = calculate_active_area(cpi, this_frame);
// Underlying boost factor is based on inter error ratio.
- frame_boost = (BASELINE_ERR_PER_MB * num_mbs) /
+ frame_boost = (BASELINE_ERR_PER_MB * active_area) /
DOUBLE_DIVIDE_CHECK(this_frame->coded_error + *sr_accumulator);
// Update the accumulator for second ref error difference.
// This is intended to give an indication of how much the coded error is
// increasing over time.
- *sr_accumulator += (this_frame->sr_coded_error - this_frame->coded_error) / 1;
+ *sr_accumulator += (this_frame->sr_coded_error - this_frame->coded_error);
*sr_accumulator = VPXMAX(0.0, *sr_accumulator);
// Small adjustment for cases where there is a zoom out
@@ -1897,20 +1936,16 @@ static double calc_kf_frame_boost(VP9_COMP *cpi,
const double lq = vp9_convert_qindex_to_q(
cpi->rc.avg_frame_qindex[INTER_FRAME], cpi->common.bit_depth);
const double boost_q_correction = VPXMIN((0.50 + (lq * 0.015)), 2.00);
- int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs
- : cpi->common.MBs;
-
- // Correct for any inactive region in the image
- num_mbs = (int)VPXMAX(1, num_mbs * calculate_active_area(cpi, this_frame));
+ const double active_area = calculate_active_area(cpi, this_frame);
// Underlying boost factor is based on inter error ratio.
- frame_boost = (KF_BASELINE_ERR_PER_MB * num_mbs) /
+ frame_boost = (KF_BASELINE_ERR_PER_MB * active_area) /
DOUBLE_DIVIDE_CHECK(this_frame->coded_error + *sr_accumulator);
// Update the accumulator for second ref error difference.
// This is intended to give an indication of how much the coded error is
// increasing over time.
- *sr_accumulator += (this_frame->sr_coded_error - this_frame->coded_error) / 1;
+ *sr_accumulator += (this_frame->sr_coded_error - this_frame->coded_error);
*sr_accumulator = VPXMAX(0.0, *sr_accumulator);
// Small adjustment for cases where there is a zoom out
@@ -2043,7 +2078,7 @@ static int64_t calculate_total_gf_group_bits(VP9_COMP *cpi,
int64_t total_group_bits;
// Calculate the bits to be allocated to the group as a whole.
- if ((twopass->kf_group_bits > 0) && (twopass->kf_group_error_left > 0)) {
+ if ((twopass->kf_group_bits > 0) && (twopass->kf_group_error_left > 0.0)) {
total_group_bits = (int64_t)(twopass->kf_group_bits *
(gf_group_err / twopass->kf_group_error_left));
} else {
@@ -2337,7 +2372,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
vp9_zero(next_frame);
// Load stats for the current frame.
- mod_frame_err = calculate_modified_err(cpi, twopass, oxcf, this_frame);
+ mod_frame_err = calculate_norm_frame_score(cpi, twopass, oxcf, this_frame);
// Note the error of the frame at the start of the group. This will be
// the GF frame error if we code a normal gf.
@@ -2370,8 +2405,8 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
cpi->common.bit_depth));
active_min_gf_interval =
rc->min_gf_interval + arf_active_or_kf + VPXMIN(2, int_max_q / 200);
- if (active_min_gf_interval > rc->max_gf_interval)
- active_min_gf_interval = rc->max_gf_interval;
+ active_min_gf_interval =
+ VPXMIN(active_min_gf_interval, rc->max_gf_interval + arf_active_or_kf);
if (cpi->multi_arf_allowed) {
active_max_gf_interval = rc->max_gf_interval;
@@ -2382,11 +2417,14 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
// interval to spread the cost of the GF.
active_max_gf_interval = 12 + arf_active_or_kf + VPXMIN(4, (int_lbq / 6));
- // We have: active_min_gf_interval <= rc->max_gf_interval
- if (active_max_gf_interval < active_min_gf_interval)
+ // We have: active_min_gf_interval <=
+ // rc->max_gf_interval + arf_active_or_kf.
+ if (active_max_gf_interval < active_min_gf_interval) {
active_max_gf_interval = active_min_gf_interval;
- else if (active_max_gf_interval > rc->max_gf_interval)
- active_max_gf_interval = rc->max_gf_interval;
+ } else {
+ active_max_gf_interval = VPXMIN(active_max_gf_interval,
+ rc->max_gf_interval + arf_active_or_kf);
+ }
  // Would the active max drop us out just before the next kf?
if ((active_max_gf_interval <= rc->frames_to_key) &&
@@ -2400,7 +2438,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
++i;
// Accumulate error score of frames in this gf group.
- mod_frame_err = calculate_modified_err(cpi, twopass, oxcf, this_frame);
+ mod_frame_err = calculate_norm_frame_score(cpi, twopass, oxcf, this_frame);
gf_group_err += mod_frame_err;
gf_group_raw_error += this_frame->coded_error;
gf_group_noise += this_frame->frame_noise_energy;
@@ -2509,7 +2547,8 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
int j;
for (j = 0; j < new_gf_interval - rc->baseline_gf_interval; ++j) {
if (EOF == input_stats(twopass, this_frame)) break;
- gf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame);
+ gf_group_err +=
+ calculate_norm_frame_score(cpi, twopass, oxcf, this_frame);
gf_group_raw_error += this_frame->coded_error;
gf_group_noise += this_frame->frame_noise_energy;
gf_group_skip_pct += this_frame->intra_skip_pct;
@@ -2564,7 +2603,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
gf_group_bits);
// Adjust KF group bits and error remaining.
- twopass->kf_group_error_left -= (int64_t)gf_group_err;
+ twopass->kf_group_error_left -= gf_group_err;
// Allocate bits to each of the frames in the GF group.
allocate_gf_group_bits(cpi, gf_group_bits, gf_arf_bits);
@@ -2614,6 +2653,9 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
#define II_IMPROVEMENT_THRESHOLD 3.5
#define KF_II_MAX 128.0
#define II_FACTOR 12.5
+// Test for very low intra complexity, which could cause false key frames.
+#define V_LOW_INTRA 0.5
+
static int test_candidate_kf(TWO_PASS *twopass,
const FIRSTPASS_STATS *last_frame,
const FIRSTPASS_STATS *this_frame,
@@ -2672,7 +2714,7 @@ static int test_candidate_kf(TWO_PASS *twopass,
0.20) &&
(next_iiratio < 3.0)) ||
((boost_score - old_boost_score) < 3.0) ||
- (local_next_frame.intra_error < 200)) {
+ (local_next_frame.intra_error < V_LOW_INTRA)) {
break;
}
@@ -2748,10 +2790,10 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
rc->frames_to_key = 1;
- twopass->kf_group_bits = 0; // Total bits available to kf group
- twopass->kf_group_error_left = 0; // Group modified error score.
+ twopass->kf_group_bits = 0; // Total bits available to kf group
+ twopass->kf_group_error_left = 0.0; // Group modified error score.
- kf_mod_err = calculate_modified_err(cpi, twopass, oxcf, this_frame);
+ kf_mod_err = calculate_norm_frame_score(cpi, twopass, oxcf, this_frame);
// Initialize the decay rates for the recent frames to check
for (j = 0; j < FRAMES_TO_CHECK_DECAY; ++j) recent_loop_decay[j] = 1.0;
@@ -2761,7 +2803,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
while (twopass->stats_in < twopass->stats_in_end &&
rc->frames_to_key < cpi->oxcf.key_freq) {
// Accumulate kf group error.
- kf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame);
+ kf_group_err += calculate_norm_frame_score(cpi, twopass, oxcf, this_frame);
// Load the next frame's stats.
last_frame = *this_frame;
@@ -2821,7 +2863,8 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
// Rescan to get the correct error data for the forced kf group.
for (i = 0; i < rc->frames_to_key; ++i) {
- kf_group_err += calculate_modified_err(cpi, twopass, oxcf, &tmp_frame);
+ kf_group_err +=
+ calculate_norm_frame_score(cpi, twopass, oxcf, &tmp_frame);
input_stats(twopass, &tmp_frame);
}
rc->next_key_frame_forced = 1;
@@ -2838,7 +2881,8 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
int j;
for (j = 0; j < new_frame_to_key - rc->frames_to_key; ++j) {
if (EOF == input_stats(twopass, this_frame)) break;
- kf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame);
+ kf_group_err +=
+ calculate_norm_frame_score(cpi, twopass, oxcf, this_frame);
}
rc->frames_to_key = new_frame_to_key;
}
@@ -2846,11 +2890,11 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
// Special case for the last key frame of the file.
if (twopass->stats_in >= twopass->stats_in_end) {
// Accumulate kf group error.
- kf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame);
+ kf_group_err += calculate_norm_frame_score(cpi, twopass, oxcf, this_frame);
}
// Calculate the number of bits that should be assigned to the kf group.
- if (twopass->bits_left > 0 && twopass->modified_error_left > 0.0) {
+ if (twopass->bits_left > 0 && twopass->normalized_score_left > 0.0) {
// Maximum number of bits for a single normal frame (not key frame).
const int max_bits = frame_max_bits(rc, &cpi->oxcf);
@@ -2860,7 +2904,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
// Default allocation based on bits left and relative
// complexity of the section.
twopass->kf_group_bits = (int64_t)(
- twopass->bits_left * (kf_group_err / twopass->modified_error_left));
+ twopass->bits_left * (kf_group_err / twopass->normalized_score_left));
// Clip based on maximum per frame rate defined by the user.
max_grp_bits = (int64_t)max_bits * (int64_t)rc->frames_to_key;
@@ -2933,12 +2977,12 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
gf_group->rf_level[0] = KF_STD;
// Note the total error score of the kf group minus the key frame itself.
- twopass->kf_group_error_left = (int)(kf_group_err - kf_mod_err);
+ twopass->kf_group_error_left = (kf_group_err - kf_mod_err);
// Adjust the count of total modified error left.
// The count of bits left is adjusted elsewhere based on real coded frame
// sizes.
- twopass->modified_error_left -= kf_group_err;
+ twopass->normalized_score_left -= kf_group_err;
if (oxcf->resize_mode == RESIZE_DYNAMIC) {
// Default to normal-sized frame on keyframes.
@@ -3170,16 +3214,10 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
target_rate = gf_group->bit_allocation[gf_group->index];
rc->base_frame_target = target_rate;
- {
- const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
- ? cpi->initial_mbs
- : cpi->common.MBs;
- // The multiplication by 256 reverses a scaling factor of (>> 8)
- // applied when combining MB error values for the frame.
- twopass->mb_av_energy =
- log(((this_frame.intra_error * 256.0) / num_mbs) + 1.0);
- twopass->mb_smooth_pct = this_frame.intra_smooth_pct;
- }
+ // The multiplication by 256 reverses a scaling factor of (>> 8)
+ // applied when combining MB error values for the frame.
+ twopass->mb_av_energy = log((this_frame.intra_error * 256.0) + 1.0);
+ twopass->mb_smooth_pct = this_frame.intra_smooth_pct;
// Update the total stats remaining structure.
subtract_stats(&twopass->total_left_stats, &this_frame);
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.h
index d660aa1ffb8..000ecd77926 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.h
@@ -138,9 +138,8 @@ typedef struct {
FIRSTPASS_STATS total_left_stats;
int first_pass_done;
int64_t bits_left;
- double modified_error_min;
- double modified_error_max;
- double modified_error_left;
+ double mean_mod_score;
+ double normalized_score_left;
double mb_av_energy;
double mb_smooth_pct;
@@ -159,7 +158,7 @@ typedef struct {
int64_t kf_group_bits;
// Error score of frames still to be coded in kf group
- int64_t kf_group_error_left;
+ double kf_group_error_left;
double bpm_factor;
int rolling_arf_group_target_bits;
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_frame_scale.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_frame_scale.c
index 349e7bd41d8..e58628388f0 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_frame_scale.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_frame_scale.c
@@ -16,7 +16,8 @@
#include "vpx_scale/yv12config.h"
void vp9_scale_and_extend_frame_c(const YV12_BUFFER_CONFIG *src,
- YV12_BUFFER_CONFIG *dst) {
+ YV12_BUFFER_CONFIG *dst,
+ INTERP_FILTER filter_type, int phase_scaler) {
const int src_w = src->y_crop_width;
const int src_h = src->y_crop_height;
const int dst_w = dst->y_crop_width;
@@ -26,7 +27,7 @@ void vp9_scale_and_extend_frame_c(const YV12_BUFFER_CONFIG *src,
const int src_strides[3] = { src->y_stride, src->uv_stride, src->uv_stride };
uint8_t *const dsts[3] = { dst->y_buffer, dst->u_buffer, dst->v_buffer };
const int dst_strides[3] = { dst->y_stride, dst->uv_stride, dst->uv_stride };
- const InterpKernel *const kernel = vp9_filter_kernels[EIGHTTAP];
+ const InterpKernel *const kernel = vp9_filter_kernels[filter_type];
int x, y, i;
for (i = 0; i < MAX_MB_PLANE; ++i) {
@@ -34,9 +35,9 @@ void vp9_scale_and_extend_frame_c(const YV12_BUFFER_CONFIG *src,
const int src_stride = src_strides[i];
const int dst_stride = dst_strides[i];
for (y = 0; y < dst_h; y += 16) {
- const int y_q4 = y * (16 / factor) * src_h / dst_h;
+ const int y_q4 = y * (16 / factor) * src_h / dst_h + phase_scaler;
for (x = 0; x < dst_w; x += 16) {
- const int x_q4 = x * (16 / factor) * src_w / dst_w;
+ const int x_q4 = x * (16 / factor) * src_w / dst_w + phase_scaler;
const uint8_t *src_ptr = srcs[i] +
(y / factor) * src_h / dst_h * src_stride +
(x / factor) * src_w / dst_w;
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mcomp.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mcomp.c
index a3939a5f85d..24e23af3b15 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mcomp.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mcomp.c
@@ -1998,18 +1998,6 @@ static int full_pixel_exhaustive(VP9_COMP *cpi, MACROBLOCK *x,
int range = sf->mesh_patterns[0].range;
int baseline_interval_divisor;
-#if CONFIG_MULTITHREAD
- if (NULL != x->search_count_mutex) pthread_mutex_lock(x->search_count_mutex);
-#endif
-
- // Keep track of number of exhaustive calls (this frame in this thread).
- ++(*x->ex_search_count_ptr);
-
-#if CONFIG_MULTITHREAD
- if (NULL != x->search_count_mutex)
- pthread_mutex_unlock(x->search_count_mutex);
-#endif
-
// Trap illegal values for interval and range for this function.
if ((range < MIN_RANGE) || (range > MAX_RANGE) || (interval < MIN_INTERVAL) ||
(interval > range))
@@ -2367,32 +2355,6 @@ int vp9_refining_search_8p_c(const MACROBLOCK *x, MV *ref_mv, int error_per_bit,
return best_sad;
}
-#define MIN_EX_SEARCH_LIMIT 128
-static int is_exhaustive_allowed(VP9_COMP *cpi, MACROBLOCK *x) {
- const SPEED_FEATURES *const sf = &cpi->sf;
- int is_exhaustive_allowed;
- int max_ex;
-
-#if CONFIG_MULTITHREAD
- if (NULL != x->search_count_mutex) pthread_mutex_lock(x->search_count_mutex);
-#endif
-
- max_ex = VPXMAX(MIN_EX_SEARCH_LIMIT,
- (*x->m_search_count_ptr * sf->max_exaustive_pct) / 100);
-
- is_exhaustive_allowed = sf->allow_exhaustive_searches &&
- (sf->exhaustive_searches_thresh < INT_MAX) &&
- (*x->ex_search_count_ptr <= max_ex) &&
- !cpi->rc.is_src_frame_alt_ref;
-
-#if CONFIG_MULTITHREAD
- if (NULL != x->search_count_mutex)
- pthread_mutex_unlock(x->search_count_mutex);
-#endif
-
- return is_exhaustive_allowed;
-}
-
int vp9_full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
MV *mvp_full, int step_param, int search_method,
int error_per_bit, int *cost_list, const MV *ref_mv,
@@ -2435,21 +2397,9 @@ int vp9_full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
MAX_MVSEARCH_STEPS - 1 - step_param, 1,
cost_list, fn_ptr, ref_mv, tmp_mv);
-#if CONFIG_MULTITHREAD
- if (NULL != x->search_count_mutex)
- pthread_mutex_lock(x->search_count_mutex);
-#endif
-
- // Keep track of number of searches (this frame in this thread).
- ++(*x->m_search_count_ptr);
-
-#if CONFIG_MULTITHREAD
- if (NULL != x->search_count_mutex)
- pthread_mutex_unlock(x->search_count_mutex);
-#endif
-
  // Should we allow a follow-on exhaustive search?
- if (is_exhaustive_allowed(cpi, x)) {
+ if ((sf->exhaustive_searches_thresh < INT_MAX) &&
+ !cpi->rc.is_src_frame_alt_ref) {
int64_t exhuastive_thr = sf->exhaustive_searches_thresh;
exhuastive_thr >>=
8 - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]);
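
For scale: assuming the usual VP9 lookup tables (where each entry is the log2 of the block size in 4x4 units), b_width_log2 + b_height_log2 is 8 for 64x64, 6 for 32x32 and 4 for 16x16, so the shift above applies the full threshold at 64x64 and quarters it for each halving of the block dimensions. A quick numeric check of that scaling:

#include <stdint.h>
#include <stdio.h>

int main(void) {
  const int64_t thresh = 1 << 22; /* an example exhaustive_searches_thresh */
  const int log2_sums[3] = { 8, 6, 4 }; /* assumed: 64x64, 32x32, 16x16 */
  const char *names[3] = { "64x64", "32x32", "16x16" };
  int i;
  for (i = 0; i < 3; ++i)
    printf("%s: thr = %lld\n", names[i],
           (long long)(thresh >> (8 - log2_sums[i])));
  return 0;
}
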
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_multi_thread.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_multi_thread.c
index f5d8e430c8a..da06fb151d8 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_multi_thread.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_multi_thread.c
@@ -110,24 +110,6 @@ void vp9_row_mt_mem_alloc(VP9_COMP *cpi) {
multi_thread_ctxt->num_tile_vert_sbs[tile_row] =
get_num_vert_units(*tile_info, MI_BLOCK_SIZE_LOG2);
}
-
-#if CONFIG_MULTITHREAD
- for (tile_row = 0; tile_row < tile_rows; tile_row++) {
- for (tile_col = 0; tile_col < tile_cols; tile_col++) {
- TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col];
-
- CHECK_MEM_ERROR(cm, this_tile->search_count_mutex,
- vpx_malloc(sizeof(*this_tile->search_count_mutex)));
-
- pthread_mutex_init(this_tile->search_count_mutex, NULL);
-
- CHECK_MEM_ERROR(cm, this_tile->enc_row_mt_mutex,
- vpx_malloc(sizeof(*this_tile->enc_row_mt_mutex)));
-
- pthread_mutex_init(this_tile->enc_row_mt_mutex, NULL);
- }
- }
-#endif
}
void vp9_row_mt_mem_dealloc(VP9_COMP *cpi) {
@@ -170,12 +152,6 @@ void vp9_row_mt_mem_dealloc(VP9_COMP *cpi) {
this_tile->row_base_thresh_freq_fact = NULL;
}
}
- pthread_mutex_destroy(this_tile->search_count_mutex);
- vpx_free(this_tile->search_count_mutex);
- this_tile->search_count_mutex = NULL;
- pthread_mutex_destroy(this_tile->enc_row_mt_mutex);
- vpx_free(this_tile->enc_row_mt_mutex);
- this_tile->enc_row_mt_mutex = NULL;
}
}
#endif
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_noise_estimate.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_noise_estimate.c
index fc2e32448e8..e2239b44b0f 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_noise_estimate.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_noise_estimate.c
@@ -26,25 +26,27 @@ void vp9_noise_estimate_init(NOISE_ESTIMATE *const ne, int width, int height) {
ne->level = kLowLow;
ne->value = 0;
ne->count = 0;
- ne->thresh = 100;
+ ne->thresh = 90;
ne->last_w = 0;
ne->last_h = 0;
if (width * height >= 1920 * 1080) {
ne->thresh = 200;
} else if (width * height >= 1280 * 720) {
ne->thresh = 140;
+ } else if (width * height >= 640 * 360) {
+ ne->thresh = 100;
}
- ne->num_frames_estimate = 20;
+ ne->num_frames_estimate = 15;
}
static int enable_noise_estimation(VP9_COMP *const cpi) {
#if CONFIG_VP9_HIGHBITDEPTH
if (cpi->common.use_highbitdepth) return 0;
#endif
-// Enable noise estimation if denoising is on, but not for low resolutions.
+// Enable noise estimation if denoising is on.
#if CONFIG_VP9_TEMPORAL_DENOISING
if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) &&
- cpi->common.width >= 640 && cpi->common.height >= 360)
+ cpi->common.width >= 320 && cpi->common.height >= 180)
return 1;
#endif
// Only allow noise estimate under certain encoding mode.
@@ -97,6 +99,7 @@ NOISE_LEVEL vp9_noise_estimate_extract_level(NOISE_ESTIMATE *const ne) {
void vp9_update_noise_estimate(VP9_COMP *const cpi) {
const VP9_COMMON *const cm = &cpi->common;
NOISE_ESTIMATE *const ne = &cpi->noise_estimate;
+ const int low_res = (cm->width <= 352 && cm->height <= 288);
// Estimate of noise level every frame_period frames.
int frame_period = 8;
int thresh_consec_zeromv = 6;
@@ -108,8 +111,17 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) {
// Estimate is between current source and last source.
YV12_BUFFER_CONFIG *last_source = cpi->Last_Source;
#if CONFIG_VP9_TEMPORAL_DENOISING
- if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi))
+ if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi)) {
last_source = &cpi->denoiser.last_source;
+ // Tune these thresholds for different resolutions when denoising is
+ // enabled.
+ if (cm->width > 640 && cm->width < 1920) {
+ thresh_consec_zeromv = 4;
+ thresh_sum_diff = 200;
+ thresh_sum_spatial = (120 * 120) << 8;
+ thresh_spatial_var = (48 * 48) << 8;
+ }
+ }
#endif
ne->enabled = enable_noise_estimation(cpi);
if (cpi->svc.number_spatial_layers > 1)
@@ -127,9 +139,12 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) {
ne->last_h = cm->height;
}
return;
- } else if (cpi->rc.avg_frame_low_motion < 50) {
+ } else if (cm->current_video_frame > 60 &&
+ cpi->rc.avg_frame_low_motion < (low_res ? 70 : 50)) {
// Force noise estimation to 0 and denoiser off if content has high motion.
ne->level = kLowLow;
+ ne->count = 0;
+ ne->num_frames_estimate = 10;
#if CONFIG_VP9_TEMPORAL_DENOISING
if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) &&
cpi->svc.current_superframe > 1) {
@@ -210,7 +225,8 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) {
// Avoid blocks with high brightness and high spatial variance.
if ((sse2 - spatial_variance) < thresh_sum_spatial &&
spatial_variance < thresh_spatial_var) {
- avg_est += variance / ((spatial_variance >> 9) + 1);
+ avg_est += low_res ? variance >> 4
+ : variance / ((spatial_variance >> 9) + 1);
num_samples++;
}
}
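
The changed accumulation means a low-resolution block now contributes a flat variance/16 to the running noise estimate instead of variance normalized by its spatial variance. A small sketch of just that expression (hypothetical helper, not encoder code):

#include <stdio.h>

static unsigned contrib(int low_res, unsigned variance,
                        unsigned spatial_variance) {
  return low_res ? variance >> 4 : variance / ((spatial_variance >> 9) + 1);
}

int main(void) {
  /* variance 3200, spatial variance 4096: 200 for low-res vs 3200/9 = 355. */
  printf("low_res: %u, normal: %u\n", contrib(1, 3200, 4096),
         contrib(0, 3200, 4096));
  return 0;
}
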
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_pickmode.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_pickmode.c
index db2bbe7c272..b05f4184bd0 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_pickmode.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_pickmode.c
@@ -170,6 +170,14 @@ static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
}
vp9_set_mv_search_range(&x->mv_limits, &ref_mv);
+ // Limit the motion vector range for large lighting changes.
+ if (cpi->oxcf.speed > 5 && x->lowvar_highsumdiff) {
+ x->mv_limits.col_min = VPXMAX(x->mv_limits.col_min, -10);
+ x->mv_limits.row_min = VPXMAX(x->mv_limits.row_min, -10);
+ x->mv_limits.col_max = VPXMIN(x->mv_limits.col_max, 10);
+ x->mv_limits.row_max = VPXMIN(x->mv_limits.row_max, 10);
+ }
+
assert(x->mv_best_ref_index[ref] <= 2);
if (x->mv_best_ref_index[ref] < 2)
mvp_full = x->mbmi_ext->ref_mvs[ref][x->mv_best_ref_index[ref]].as_mv;
@@ -203,9 +211,7 @@ static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
!(RDCOST(x->rdmult, x->rddiv, (*rate_mv + rate_mode), 0) > best_rd_sofar);
if (rv) {
- const int subpel_force_stop = use_base_mv && cpi->sf.base_mv_aggressive
- ? 2
- : cpi->sf.mv.subpel_force_stop;
+ const int subpel_force_stop = cpi->sf.mv.subpel_force_stop;
cpi->find_fractional_mv_step(
x, &tmp_mv->as_mv, &ref_mv, cpi->common.allow_high_precision_mv,
x->errorperbit, &cpi->fn_ptr[bsize], subpel_force_stop,
@@ -354,9 +360,11 @@ static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize,
*sse_y = sse;
#if CONFIG_VP9_TEMPORAL_DENOISING
- if (cpi->oxcf.noise_sensitivity > 0 && cpi->oxcf.speed > 5)
+ if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) &&
+ cpi->oxcf.speed > 5)
ac_thr = vp9_scale_acskip_thresh(ac_thr, cpi->denoiser.denoising_level,
- (abs(sum) >> (bw + bh)));
+ (abs(sum) >> (bw + bh)),
+ cpi->svc.temporal_layer_id);
else
ac_thr *= ac_thr_factor(cpi->oxcf.speed, cpi->common.width,
cpi->common.height, abs(sum) >> (bw + bh));
@@ -452,28 +460,32 @@ static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize,
// Transform skipping test in UV planes.
for (i = 1; i <= 2; i++) {
- struct macroblock_plane *const p = &x->plane[i];
- struct macroblockd_plane *const pd = &xd->plane[i];
- const TX_SIZE uv_tx_size = get_uv_tx_size(xd->mi[0], pd);
- const BLOCK_SIZE unit_size = txsize_to_bsize[uv_tx_size];
- const BLOCK_SIZE uv_bsize = get_plane_block_size(bsize, pd);
- const int uv_bw = b_width_log2_lookup[uv_bsize];
- const int uv_bh = b_height_log2_lookup[uv_bsize];
- const int sf = (uv_bw - b_width_log2_lookup[unit_size]) +
- (uv_bh - b_height_log2_lookup[unit_size]);
- const uint32_t uv_dc_thr = pd->dequant[0] * pd->dequant[0] >> (6 - sf);
- const uint32_t uv_ac_thr = pd->dequant[1] * pd->dequant[1] >> (6 - sf);
- int j = i - 1;
-
- vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, i);
- var_uv[j] = cpi->fn_ptr[uv_bsize].vf(
- p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, &sse_uv[j]);
-
- if ((var_uv[j] < uv_ac_thr || var_uv[j] == 0) &&
- (sse_uv[j] - var_uv[j] < uv_dc_thr || sse_uv[j] == var_uv[j]))
- skip_uv[j] = 1;
- else
- break;
+ if (cpi->oxcf.speed < 8 || x->color_sensitivity[i - 1]) {
+ struct macroblock_plane *const p = &x->plane[i];
+ struct macroblockd_plane *const pd = &xd->plane[i];
+ const TX_SIZE uv_tx_size = get_uv_tx_size(xd->mi[0], pd);
+ const BLOCK_SIZE unit_size = txsize_to_bsize[uv_tx_size];
+ const BLOCK_SIZE uv_bsize = get_plane_block_size(bsize, pd);
+ const int uv_bw = b_width_log2_lookup[uv_bsize];
+ const int uv_bh = b_height_log2_lookup[uv_bsize];
+ const int sf = (uv_bw - b_width_log2_lookup[unit_size]) +
+ (uv_bh - b_height_log2_lookup[unit_size]);
+ const uint32_t uv_dc_thr = pd->dequant[0] * pd->dequant[0] >> (6 - sf);
+ const uint32_t uv_ac_thr = pd->dequant[1] * pd->dequant[1] >> (6 - sf);
+ int j = i - 1;
+
+ vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, i);
+ var_uv[j] = cpi->fn_ptr[uv_bsize].vf(
+ p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, &sse_uv[j]);
+
+ if ((var_uv[j] < uv_ac_thr || var_uv[j] == 0) &&
+ (sse_uv[j] - var_uv[j] < uv_dc_thr || sse_uv[j] == var_uv[j]))
+ skip_uv[j] = 1;
+ else
+ break;
+ } else {
+ skip_uv[i - 1] = 1;
+ }
}
// If the transforms in the YUV planes are skippable, the mode search checks
@@ -481,7 +493,6 @@ static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize,
if (skip_uv[0] & skip_uv[1]) {
*early_term = 1;
}
-
return;
}
@@ -616,7 +627,7 @@ static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x,
static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc,
int *skippable, int64_t *sse, BLOCK_SIZE bsize,
- TX_SIZE tx_size) {
+ TX_SIZE tx_size, int rd_computed) {
MACROBLOCKD *xd = &x->e_mbd;
const struct macroblockd_plane *pd = &xd->plane[0];
struct macroblock_plane *const p = &x->plane[0];
@@ -643,8 +654,9 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc,
bsize < BLOCK_32X32)) {
unsigned int var_y, sse_y;
(void)tx_size;
- model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc->rate, &this_rdc->dist,
- &var_y, &sse_y);
+ if (!rd_computed)
+ model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc->rate, &this_rdc->dist,
+ &var_y, &sse_y);
*sse = INT_MAX;
*skippable = 0;
return;
@@ -655,8 +667,9 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc,
bsize < BLOCK_32X32) {
unsigned int var_y, sse_y;
(void)tx_size;
- model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc->rate, &this_rdc->dist,
- &var_y, &sse_y);
+ if (!rd_computed)
+ model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc->rate, &this_rdc->dist,
+ &var_y, &sse_y);
*sse = INT_MAX;
*skippable = 0;
return;
@@ -978,7 +991,7 @@ static void estimate_block_intra(int plane, int block, int row, int col,
int64_t this_sse = INT64_MAX;
// TODO(jingning): This needs further refactoring.
block_yrd(cpi, x, &this_rdc, &args->skippable, &this_sse, bsize_tx,
- VPXMIN(tx_size, TX_16X16));
+ VPXMIN(tx_size, TX_16X16), 0);
} else {
unsigned int var = 0;
unsigned int sse = 0;
@@ -1151,8 +1164,8 @@ static const REF_MODE ref_mode_set[RT_INTER_MODES] = {
{ ALTREF_FRAME, NEARMV }, { ALTREF_FRAME, NEWMV }
};
static const REF_MODE ref_mode_set_svc[RT_INTER_MODES] = {
- { LAST_FRAME, ZEROMV }, { GOLDEN_FRAME, ZEROMV },
- { LAST_FRAME, NEARESTMV }, { LAST_FRAME, NEARMV },
+ { LAST_FRAME, ZEROMV }, { LAST_FRAME, NEARESTMV },
+ { LAST_FRAME, NEARMV }, { GOLDEN_FRAME, ZEROMV },
{ GOLDEN_FRAME, NEARESTMV }, { GOLDEN_FRAME, NEARMV },
{ LAST_FRAME, NEWMV }, { GOLDEN_FRAME, NEWMV }
};
@@ -1216,7 +1229,8 @@ static INLINE void find_predictors(
static void vp9_NEWMV_diff_bias(const NOISE_ESTIMATE *ne, MACROBLOCKD *xd,
PREDICTION_MODE this_mode, RD_COST *this_rdc,
BLOCK_SIZE bsize, int mv_row, int mv_col,
- int is_last_frame) {
+ int is_last_frame, int lowvar_highsumdiff,
+ int is_skin) {
// Bias against MVs associated with NEWMV mode that are very different from
// top/left neighbors.
if (this_mode == NEWMV) {
@@ -1263,9 +1277,12 @@ static void vp9_NEWMV_diff_bias(const NOISE_ESTIMATE *ne, MACROBLOCKD *xd,
// If noise estimation is enabled, and estimated level is above threshold,
// add a bias to LAST reference with small motion, for large blocks.
if (ne->enabled && ne->level >= kMedium && bsize >= BLOCK_32X32 &&
- is_last_frame && mv_row < 8 && mv_row > -8 && mv_col < 8 && mv_col > -8) {
- this_rdc->rdcost = 7 * this_rdc->rdcost >> 3;
- }
+ is_last_frame && mv_row < 8 && mv_row > -8 && mv_col < 8 && mv_col > -8)
+ this_rdc->rdcost = 7 * (this_rdc->rdcost >> 3);
+ else if (lowvar_highsumdiff && !is_skin && bsize >= BLOCK_16X16 &&
+ is_last_frame && mv_row < 16 && mv_row > -16 && mv_col < 16 &&
+ mv_col > -16)
+ this_rdc->rdcost = 7 * (this_rdc->rdcost >> 3);
}
#if CONFIG_VP9_TEMPORAL_DENOISING
@@ -1465,11 +1482,14 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
int use_golden_nonzeromv = 1;
int force_skip_low_temp_var = 0;
int skip_ref_find_pred[4] = { 0 };
+ unsigned int sse_zeromv_normalized = UINT_MAX;
+ unsigned int thresh_svc_skip_golden = 500;
#if CONFIG_VP9_TEMPORAL_DENOISING
VP9_PICKMODE_CTX_DEN ctx_den;
int64_t zero_last_cost_orig = INT64_MAX;
int denoise_svc_pickmode = 1;
#endif
+ INTERP_FILTER filter_gf_svc = EIGHTTAP;
init_ref_frame_cost(cm, xd, ref_frame_cost);
@@ -1608,6 +1628,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
int64_t this_sse;
int is_skippable;
int this_early_term = 0;
+ int rd_computed = 0;
+
PREDICTION_MODE this_mode = ref_mode_set[idx].pred_mode;
ref_frame = ref_mode_set[idx].ref_frame;
@@ -1619,6 +1641,12 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
if (ref_frame > usable_ref_frame) continue;
if (skip_ref_find_pred[ref_frame]) continue;
+ // For SVC, skip the golden (spatial) reference search if sse of zeromv_last
+ // is below threshold.
+ if (cpi->use_svc && ref_frame == GOLDEN_FRAME &&
+ sse_zeromv_normalized < thresh_svc_skip_golden)
+ continue;
+
if (sf->short_circuit_flat_blocks && x->source_variance == 0 &&
this_mode != NEARESTMV) {
continue;
@@ -1715,15 +1743,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
rd_less_than_thresh_row_mt(best_rdc.rdcost, mode_rd_thresh,
&rd_thresh_freq_fact[mode_index])) ||
(!cpi->sf.adaptive_rd_thresh_row_mt &&
- rd_less_than_thresh(
- best_rdc.rdcost, mode_rd_thresh,
-#if CONFIG_MULTITHREAD
- // Synchronization of this function
- // is only necessary when
- // adaptive_rd_thresh is > 0.
- cpi->sf.adaptive_rd_thresh ? tile_data->enc_row_mt_mutex : NULL,
-#endif
- &rd_thresh_freq_fact[mode_index])))
+ rd_less_than_thresh(best_rdc.rdcost, mode_rd_thresh,
+ &rd_thresh_freq_fact[mode_index])))
continue;
if (this_mode == NEWMV) {
@@ -1835,12 +1856,14 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
(((mi->mv[0].as_mv.row | mi->mv[0].as_mv.col) & 0x07) != 0)) {
int pf_rate[3];
int64_t pf_dist[3];
+ int curr_rate[3];
unsigned int pf_var[3];
unsigned int pf_sse[3];
TX_SIZE pf_tx_size[3];
int64_t best_cost = INT64_MAX;
INTERP_FILTER best_filter = SWITCHABLE, filter;
PRED_BUFFER *current_pred = this_mode_pred;
+ rd_computed = 1;
for (filter = EIGHTTAP; filter <= EIGHTTAP_SMOOTH; ++filter) {
int64_t cost;
@@ -1848,6 +1871,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rate[filter], &pf_dist[filter],
&pf_var[filter], &pf_sse[filter]);
+ curr_rate[filter] = pf_rate[filter];
pf_rate[filter] += vp9_get_switchable_rate(cpi, xd);
cost = RDCOST(x->rdmult, x->rddiv, pf_rate[filter], pf_dist[filter]);
pf_tx_size[filter] = mi->tx_size;
@@ -1873,7 +1897,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
mi->interp_filter = best_filter;
mi->tx_size = pf_tx_size[best_filter];
- this_rdc.rate = pf_rate[best_filter];
+ this_rdc.rate = curr_rate[best_filter];
this_rdc.dist = pf_dist[best_filter];
var_y = pf_var[best_filter];
sse_y = pf_sse[best_filter];
@@ -1887,6 +1911,11 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
? bsize > BLOCK_32X32
: bsize >= BLOCK_32X32;
mi->interp_filter = (filter_ref == SWITCHABLE) ? EIGHTTAP : filter_ref;
+
+ if (cpi->use_svc && ref_frame == GOLDEN_FRAME &&
+ svc_force_zero_mode[ref_frame - 1])
+ mi->interp_filter = filter_gf_svc;
+
vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
// For large partition blocks, extra testing is done.
@@ -1897,15 +1926,23 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
&this_rdc.dist, &var_y, &sse_y, mi_row, mi_col,
&this_early_term);
} else {
+ rd_computed = 1;
model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc.rate, &this_rdc.dist,
&var_y, &sse_y);
}
+ // Save normalized sse (between current and last frame) for (0, 0) motion.
+ if (cpi->use_svc && ref_frame == LAST_FRAME &&
+ frame_mv[this_mode][ref_frame].as_int == 0) {
+ sse_zeromv_normalized =
+ sse_y >> (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]);
+ }
}
if (!this_early_term) {
this_sse = (int64_t)sse_y;
block_yrd(cpi, x, &this_rdc, &is_skippable, &this_sse, bsize,
- VPXMIN(mi->tx_size, TX_16X16));
+ VPXMIN(mi->tx_size, TX_16X16), rd_computed);
+
x->skip_txfm[0] = is_skippable;
if (is_skippable) {
this_rdc.rate = vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
@@ -1956,7 +1993,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
vp9_NEWMV_diff_bias(&cpi->noise_estimate, xd, this_mode, &this_rdc, bsize,
frame_mv[this_mode][ref_frame].as_mv.row,
frame_mv[this_mode][ref_frame].as_mv.col,
- ref_frame == LAST_FRAME);
+ ref_frame == LAST_FRAME, x->lowvar_highsumdiff,
+ x->sb_is_skin);
}
// Skipping checking: test to see if this block can be reconstructed by
@@ -2038,7 +2076,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
if (best_rdc.rdcost == INT64_MAX ||
((!force_skip_low_temp_var || bsize < BLOCK_32X32) &&
perform_intra_pred && !x->skip && best_rdc.rdcost > inter_mode_thresh &&
- bsize <= cpi->sf.max_intra_bsize && !x->skip_low_source_sad)) {
+ bsize <= cpi->sf.max_intra_bsize && !x->skip_low_source_sad &&
+ !x->lowvar_highsumdiff)) {
struct estimate_block_intra_args args = { cpi, x, DC_PRED, 1, 0 };
int i;
TX_SIZE best_intra_tx_size = TX_SIZES;
@@ -2053,9 +2092,10 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
this_mode_pred = &tmp[get_pred_buffer(tmp, 3)];
#if CONFIG_VP9_HIGHBITDEPTH
if (cm->use_highbitdepth)
- vpx_highbd_convolve_copy(best_pred->data, best_pred->stride,
- this_mode_pred->data, this_mode_pred->stride,
- NULL, 0, NULL, 0, bw, bh, xd->bd);
+ vpx_highbd_convolve_copy(
+ CONVERT_TO_SHORTPTR(best_pred->data), best_pred->stride,
+ CONVERT_TO_SHORTPTR(this_mode_pred->data), this_mode_pred->stride,
+ NULL, 0, NULL, 0, bw, bh, xd->bd);
else
vpx_convolve_copy(best_pred->data, best_pred->stride,
this_mode_pred->data, this_mode_pred->stride, NULL,
@@ -2086,15 +2126,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
rd_less_than_thresh_row_mt(best_rdc.rdcost, mode_rd_thresh,
&rd_thresh_freq_fact[mode_index])) ||
(!cpi->sf.adaptive_rd_thresh_row_mt &&
- rd_less_than_thresh(
- best_rdc.rdcost, mode_rd_thresh,
-#if CONFIG_MULTITHREAD
- // Synchronization of this function
- // is only necessary when
- // adaptive_rd_thresh is > 0.
- cpi->sf.adaptive_rd_thresh ? tile_data->enc_row_mt_mutex : NULL,
-#endif
- &rd_thresh_freq_fact[mode_index])))
+ rd_less_than_thresh(best_rdc.rdcost, mode_rd_thresh,
+ &rd_thresh_freq_fact[mode_index])))
continue;
mi->mode = this_mode;
@@ -2162,9 +2195,10 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
if (best_pred->data != orig_dst.buf && is_inter_mode(mi->mode)) {
#if CONFIG_VP9_HIGHBITDEPTH
if (cm->use_highbitdepth)
- vpx_highbd_convolve_copy(best_pred->data, best_pred->stride,
- pd->dst.buf, pd->dst.stride, NULL, 0, NULL, 0,
- bw, bh, xd->bd);
+ vpx_highbd_convolve_copy(
+ CONVERT_TO_SHORTPTR(best_pred->data), best_pred->stride,
+ CONVERT_TO_SHORTPTR(pd->dst.buf), pd->dst.stride, NULL, 0, NULL, 0,
+ bw, bh, xd->bd);
else
vpx_convolve_copy(best_pred->data, best_pred->stride, pd->dst.buf,
pd->dst.stride, NULL, 0, NULL, 0, bw, bh);
@@ -2407,7 +2441,8 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int mi_row,
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
vp9_highbd_build_inter_predictor(
- pd->pre[0].buf, pd->pre[0].stride, pd->dst.buf, pd->dst.stride,
+ CONVERT_TO_SHORTPTR(pd->pre[0].buf), pd->pre[0].stride,
+ CONVERT_TO_SHORTPTR(pd->dst.buf), pd->dst.stride,
&xd->mi[0]->bmi[i].as_mv[0].as_mv, &xd->block_refs[0]->sf,
4 * num_4x4_blocks_wide, 4 * num_4x4_blocks_high, 0,
vp9_filter_kernels[mi->interp_filter], MV_PRECISION_Q3,
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.c
index f79b7c6fc27..27fea5d4e78 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.c
@@ -547,6 +547,7 @@ void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi) {
int vp9_rc_regulate_q(const VP9_COMP *cpi, int target_bits_per_frame,
int active_best_quality, int active_worst_quality) {
const VP9_COMMON *const cm = &cpi->common;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
int q = active_worst_quality;
int last_error = INT_MAX;
int i, target_bits_per_mb, bits_per_mb_at_this_q;
@@ -561,7 +562,7 @@ int vp9_rc_regulate_q(const VP9_COMP *cpi, int target_bits_per_frame,
do {
if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled &&
- cpi->svc.temporal_layer_id == 0 &&
+ cr->apply_cyclic_refresh &&
(!cpi->oxcf.gf_cbr_boost_pct || !cpi->refresh_golden_frame)) {
bits_per_mb_at_this_q =
(int)vp9_cyclic_refresh_rc_bits_per_mb(cpi, i, correction_factor);
@@ -2172,6 +2173,11 @@ void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi, uint64_t avg_sad_current) {
if (rate_err < 2.0 && !high_content) {
rc->fac_active_worst_inter = 120;
rc->fac_active_worst_gf = 90;
+ } else if (rate_err > 8.0 && rc->avg_frame_qindex[INTER_FRAME] < 16) {
+ // Increase active_worst faster at low Q if rate fluctuation is high.
+ rc->fac_active_worst_inter = 200;
+ if (rc->avg_frame_qindex[INTER_FRAME] < 8)
+ rc->fac_active_worst_inter = 400;
}
if (low_content && rc->avg_frame_low_motion > 80) {
rc->af_ratio_onepass_vbr = 15;
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rd.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rd.c
index 3c49fe665d4..39a7742f0f4 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rd.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rd.c
@@ -650,15 +650,7 @@ void vp9_set_rd_speed_thresholds_sub8x8(VP9_COMP *cpi) {
}
void vp9_update_rd_thresh_fact(int (*factor_buf)[MAX_MODES], int rd_thresh,
- int bsize,
-#if CONFIG_MULTITHREAD
- pthread_mutex_t *enc_row_mt_mutex,
-#endif
- int best_mode_index) {
-#if CONFIG_MULTITHREAD
- if (NULL != enc_row_mt_mutex) pthread_mutex_lock(enc_row_mt_mutex);
-#endif
-
+ int bsize, int best_mode_index) {
if (rd_thresh > 0) {
const int top_mode = bsize < BLOCK_8X8 ? MAX_REFS : MAX_MODES;
int mode;
@@ -676,10 +668,6 @@ void vp9_update_rd_thresh_fact(int (*factor_buf)[MAX_MODES], int rd_thresh,
}
}
}
-
-#if CONFIG_MULTITHREAD
- if (NULL != enc_row_mt_mutex) pthread_mutex_unlock(enc_row_mt_mutex);
-#endif
}
int vp9_get_intra_cost_penalty(int qindex, int qdelta,
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rd.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rd.h
index aae47dcdda4..1e117686676 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rd.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rd.h
@@ -170,32 +170,11 @@ void vp9_set_rd_speed_thresholds(struct VP9_COMP *cpi);
void vp9_set_rd_speed_thresholds_sub8x8(struct VP9_COMP *cpi);
void vp9_update_rd_thresh_fact(int (*fact)[MAX_MODES], int rd_thresh, int bsize,
-#if CONFIG_MULTITHREAD
- pthread_mutex_t *enc_row_mt_mutex,
-#endif
int best_mode_index);
static INLINE int rd_less_than_thresh(int64_t best_rd, int thresh,
-#if CONFIG_MULTITHREAD
- pthread_mutex_t *enc_row_mt_mutex,
-#endif
const int *const thresh_fact) {
- int is_rd_less_than_thresh;
-
-#if CONFIG_MULTITHREAD
- // Synchronize to ensure data coherency as thresh_freq_fact is maintained at
- // tile level and not thread-safe with row based multi-threading
- if (NULL != enc_row_mt_mutex) pthread_mutex_lock(enc_row_mt_mutex);
-#endif
-
- is_rd_less_than_thresh =
- best_rd < ((int64_t)thresh * (*thresh_fact) >> 5) || thresh == INT_MAX;
-
-#if CONFIG_MULTITHREAD
- if (NULL != enc_row_mt_mutex) pthread_mutex_unlock(enc_row_mt_mutex);
-#endif
-
- return is_rd_less_than_thresh;
+ return best_rd < ((int64_t)thresh * (*thresh_fact) >> 5) || thresh == INT_MAX;
}
static INLINE void set_error_per_bit(MACROBLOCK *x, int rdmult) {
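
With the mutex handling gone, rd_less_than_thresh() is a pure comparison; thresh_fact reads as a fixed-point scale where 32 means 1.0 (hence the >> 5). A stand-alone copy with a few example values:

#include <limits.h>
#include <stdint.h>
#include <stdio.h>

static int rd_less_than_thresh(int64_t best_rd, int thresh,
                               const int *thresh_fact) {
  return best_rd < ((int64_t)thresh * (*thresh_fact) >> 5) || thresh == INT_MAX;
}

int main(void) {
  int fact = 32; /* 1.0: the threshold is used as-is */
  printf("%d\n", rd_less_than_thresh(900, 1000, &fact));  /* 1 */
  printf("%d\n", rd_less_than_thresh(1500, 1000, &fact)); /* 0 */
  fact = 64; /* 2.0: the threshold is doubled */
  printf("%d\n", rd_less_than_thresh(1500, 1000, &fact)); /* 1 */
  return 0;
}
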
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rdopt.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rdopt.c
index d23d324466d..bf0fec3d8d8 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rdopt.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rdopt.c
@@ -599,28 +599,28 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane,
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- recon = CONVERT_TO_BYTEPTR(recon);
- vpx_highbd_convolve_copy(dst, dst_stride, recon, 32, NULL, 0, NULL, 0,
- bs, bs, xd->bd);
+ vpx_highbd_convolve_copy(CONVERT_TO_SHORTPTR(dst), dst_stride, recon16,
+ 32, NULL, 0, NULL, 0, bs, bs, xd->bd);
if (xd->lossless) {
- vp9_highbd_iwht4x4_add(dqcoeff, recon, 32, *eob, xd->bd);
+ vp9_highbd_iwht4x4_add(dqcoeff, recon16, 32, *eob, xd->bd);
} else {
switch (tx_size) {
case TX_4X4:
- vp9_highbd_idct4x4_add(dqcoeff, recon, 32, *eob, xd->bd);
+ vp9_highbd_idct4x4_add(dqcoeff, recon16, 32, *eob, xd->bd);
break;
case TX_8X8:
- vp9_highbd_idct8x8_add(dqcoeff, recon, 32, *eob, xd->bd);
+ vp9_highbd_idct8x8_add(dqcoeff, recon16, 32, *eob, xd->bd);
break;
case TX_16X16:
- vp9_highbd_idct16x16_add(dqcoeff, recon, 32, *eob, xd->bd);
+ vp9_highbd_idct16x16_add(dqcoeff, recon16, 32, *eob, xd->bd);
break;
case TX_32X32:
- vp9_highbd_idct32x32_add(dqcoeff, recon, 32, *eob, xd->bd);
+ vp9_highbd_idct32x32_add(dqcoeff, recon16, 32, *eob, xd->bd);
break;
default: assert(0 && "Invalid transform size");
}
}
+ recon = CONVERT_TO_BYTEPTR(recon16);
} else {
#endif // CONFIG_VP9_HIGHBITDEPTH
vpx_convolve_copy(dst, dst_stride, recon, 32, NULL, 0, NULL, 0, bs, bs);
@@ -1004,6 +1004,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row,
const int block = (row + idy) * 2 + (col + idx);
const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride];
uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride];
+ uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst);
int16_t *const src_diff =
vp9_raster_block_offset_int16(BLOCK_8X8, block, p->src_diff);
tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
@@ -1025,7 +1026,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row,
tempa[idx] = templ[idy] = (x->plane[0].eobs[block] > 0 ? 1 : 0);
if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
goto next_highbd;
- vp9_highbd_iwht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block), dst,
+ vp9_highbd_iwht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block), dst16,
dst_stride, p->eobs[block], xd->bd);
} else {
int64_t unused;
@@ -1048,7 +1049,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row,
if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
goto next_highbd;
vp9_highbd_iht4x4_add(tx_type, BLOCK_OFFSET(pd->dqcoeff, block),
- dst, dst_stride, p->eobs[block], xd->bd);
+ dst16, dst_stride, p->eobs[block], xd->bd);
}
}
}
@@ -1528,7 +1529,8 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, MACROBLOCK *x,
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
vp9_highbd_build_inter_predictor(
- pre, y_stride, dst, pd->dst.stride, &mi->bmi[i].as_mv[ref].as_mv,
+ CONVERT_TO_SHORTPTR(pre), y_stride, CONVERT_TO_SHORTPTR(dst),
+ pd->dst.stride, &mi->bmi[i].as_mv[ref].as_mv,
&xd->block_refs[ref]->sf, width, height, ref, kernel, MV_PRECISION_Q3,
mi_col * MI_SIZE + 4 * (i % 2), mi_row * MI_SIZE + 4 * (i / 2),
xd->bd);
@@ -1783,9 +1785,9 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
second_pred = CONVERT_TO_BYTEPTR(second_pred_alloc_16);
vp9_highbd_build_inter_predictor(
- ref_yv12[!id].buf, ref_yv12[!id].stride, second_pred, pw,
- &frame_mv[refs[!id]].as_mv, &sf, pw, ph, 0, kernel, MV_PRECISION_Q3,
- mi_col * MI_SIZE, mi_row * MI_SIZE, xd->bd);
+ CONVERT_TO_SHORTPTR(ref_yv12[!id].buf), ref_yv12[!id].stride,
+ second_pred_alloc_16, pw, &frame_mv[refs[!id]].as_mv, &sf, pw, ph, 0,
+ kernel, MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE, xd->bd);
} else {
second_pred = (uint8_t *)second_pred_alloc_16;
vp9_build_inter_predictor(ref_yv12[!id].buf, ref_yv12[!id].stride,
@@ -3160,11 +3162,6 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data,
for (i = 0; i <= LAST_NEW_MV_INDEX; ++i) mode_threshold[i] = 0;
-#if CONFIG_MULTITHREAD
- if (NULL != tile_data->enc_row_mt_mutex)
- pthread_mutex_lock(tile_data->enc_row_mt_mutex);
-#endif
-
for (i = LAST_NEW_MV_INDEX + 1; i < MAX_MODES; ++i)
mode_threshold[i] = ((int64_t)rd_threshes[i] * rd_thresh_freq_fact[i]) >> 5;
@@ -3186,11 +3183,6 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data,
memcpy(mode_map, tile_mode_map, sizeof(mode_map));
-#if CONFIG_MULTITHREAD
- if (NULL != tile_data->enc_row_mt_mutex)
- pthread_mutex_unlock(tile_data->enc_row_mt_mutex);
-#endif
-
for (midx = 0; midx < MAX_MODES; ++midx) {
int mode_index = mode_map[midx];
int mode_excluded = 0;
@@ -3627,11 +3619,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data,
if (!cpi->rc.is_src_frame_alt_ref)
vp9_update_rd_thresh_fact(tile_data->thresh_freq_fact,
- sf->adaptive_rd_thresh, bsize,
-#if CONFIG_MULTITHREAD
- tile_data->enc_row_mt_mutex,
-#endif
- best_mode_index);
+ sf->adaptive_rd_thresh, bsize, best_mode_index);
// macroblock modes
*mi = best_mbmode;
@@ -3771,11 +3759,7 @@ void vp9_rd_pick_inter_mode_sb_seg_skip(VP9_COMP *cpi, TileDataEnc *tile_data,
(cm->interp_filter == mi->interp_filter));
vp9_update_rd_thresh_fact(tile_data->thresh_freq_fact,
- cpi->sf.adaptive_rd_thresh, bsize,
-#if CONFIG_MULTITHREAD
- tile_data->enc_row_mt_mutex,
-#endif
- THR_ZEROMV);
+ cpi->sf.adaptive_rd_thresh, bsize, THR_ZEROMV);
vp9_zero(best_pred_diff);
vp9_zero(best_filter_diff);
@@ -3921,9 +3905,6 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data,
if (!internal_active_edge &&
rd_less_than_thresh(best_rd,
rd_opt->threshes[segment_id][bsize][ref_index],
-#if CONFIG_MULTITHREAD
- tile_data->enc_row_mt_mutex,
-#endif
&rd_thresh_freq_fact[ref_index]))
continue;
@@ -4373,11 +4354,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data,
!is_inter_block(&best_mbmode));
vp9_update_rd_thresh_fact(tile_data->thresh_freq_fact, sf->adaptive_rd_thresh,
- bsize,
-#if CONFIG_MULTITHREAD
- tile_data->enc_row_mt_mutex,
-#endif
- best_ref_index);
+ bsize, best_ref_index);
// macroblock modes
*mi = best_mbmode;
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.c
index f74b6b0e9e3..8d9e2e8c37f 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.c
@@ -20,19 +20,14 @@ static MESH_PATTERN best_quality_mesh_pattern[MAX_MESH_STEP] = {
{ 64, 4 }, { 28, 2 }, { 15, 1 }, { 7, 1 }
};
-#define MAX_MESH_SPEED 5 // Max speed setting for mesh motion method
+// Define 3 mesh density levels to control the number of searches.
+#define MESH_DENSITY_LEVELS 3
static MESH_PATTERN
- good_quality_mesh_patterns[MAX_MESH_SPEED + 1][MAX_MESH_STEP] = {
- { { 64, 8 }, { 28, 4 }, { 15, 1 }, { 7, 1 } },
+ good_quality_mesh_patterns[MESH_DENSITY_LEVELS][MAX_MESH_STEP] = {
{ { 64, 8 }, { 28, 4 }, { 15, 1 }, { 7, 1 } },
{ { 64, 8 }, { 14, 2 }, { 7, 1 }, { 7, 1 } },
{ { 64, 16 }, { 24, 8 }, { 12, 4 }, { 7, 1 } },
- { { 64, 16 }, { 24, 8 }, { 12, 4 }, { 7, 1 } },
- { { 64, 16 }, { 24, 8 }, { 12, 4 }, { 7, 1 } },
};
-static unsigned char good_quality_max_mesh_pct[MAX_MESH_SPEED + 1] = {
- 50, 25, 15, 5, 1, 1
-};
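
Each MESH_PATTERN pairs a search range with a sample interval: the mesh stage scans points spaced 'interval' apart within +/-'range' of the current best motion vector, re-centers on the winner, then moves on to the next, finer pattern. A simplified, hypothetical sketch of that refinement loop (toy cost function standing in for the encoder's SAD/RD cost; not the library's implementation):

#include <limits.h>
#include <stdio.h>

typedef struct { int range, interval; } MESH_PATTERN;

/* Toy cost, minimized at (0, 0). */
static int cost_at(int r, int c) { return r * r + c * c; }

static void mesh_search(const MESH_PATTERN *p, int n, int *br, int *bc) {
  int i;
  for (i = 0; i < n; ++i) {
    int best = INT_MAX, r, c;
    const int cr = *br, cc = *bc; /* center of this stage's grid */
    for (r = cr - p[i].range; r <= cr + p[i].range; r += p[i].interval)
      for (c = cc - p[i].range; c <= cc + p[i].range; c += p[i].interval) {
        const int s = cost_at(r, c);
        if (s < best) { best = s; *br = r; *bc = c; }
      }
  }
}

int main(void) {
  /* Density level 0 from the table above. */
  static const MESH_PATTERN p[4] = { { 64, 8 }, { 28, 4 }, { 15, 1 }, { 7, 1 } };
  int r = 50, c = -30;
  mesh_search(p, 4, &r, &c);
  printf("best point: (%d, %d)\n", r, c); /* converges to (0, 0) */
  return 0;
}
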
// Intra only frames, golden frames (except alt ref overlays) and
// alt ref frames tend to be coded at a higher than ambient quality
@@ -163,14 +158,29 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi,
SPEED_FEATURES *sf,
int speed) {
const int boosted = frame_is_boosted(cpi);
+ int i;
sf->tx_size_search_breakout = 1;
sf->adaptive_rd_thresh = 1;
+ sf->adaptive_rd_thresh_row_mt = 0;
sf->allow_skip_recode = 1;
sf->less_rectangular_check = 1;
sf->use_square_partition_only = !frame_is_boosted(cpi);
sf->use_square_only_threshold = BLOCK_16X16;
+ if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) {
+ sf->exhaustive_searches_thresh = (1 << 22);
+ for (i = 0; i < MAX_MESH_STEP; ++i) {
+ int mesh_density_level = 0;
+ sf->mesh_patterns[i].range =
+ good_quality_mesh_patterns[mesh_density_level][i].range;
+ sf->mesh_patterns[i].interval =
+ good_quality_mesh_patterns[mesh_density_level][i].interval;
+ }
+ } else {
+ sf->exhaustive_searches_thresh = INT_MAX;
+ }
+
if (speed >= 1) {
if (cpi->oxcf.pass == 2) {
TWO_PASS *const twopass = &cpi->twopass;
@@ -208,6 +218,10 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi,
sf->recode_tolerance_low = 15;
sf->recode_tolerance_high = 30;
+
+ sf->exhaustive_searches_thresh =
+ (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ? (1 << 23)
+ : INT_MAX;
}
if (speed >= 2) {
@@ -229,6 +243,16 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi,
sf->allow_partition_search_skip = 1;
sf->recode_tolerance_low = 15;
sf->recode_tolerance_high = 45;
+
+ if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) {
+ for (i = 0; i < MAX_MESH_STEP; ++i) {
+ int mesh_density_level = 1;
+ sf->mesh_patterns[i].range =
+ good_quality_mesh_patterns[mesh_density_level][i].range;
+ sf->mesh_patterns[i].interval =
+ good_quality_mesh_patterns[mesh_density_level][i].interval;
+ }
+ }
}
if (speed >= 3) {
@@ -247,6 +271,16 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi,
sf->intra_y_mode_mask[TX_32X32] = INTRA_DC;
sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC;
sf->adaptive_interp_filter_search = 1;
+
+ if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) {
+ for (i = 0; i < MAX_MESH_STEP; ++i) {
+ int mesh_density_level = 2;
+ sf->mesh_patterns[i].range =
+ good_quality_mesh_patterns[mesh_density_level][i].range;
+ sf->mesh_patterns[i].interval =
+ good_quality_mesh_patterns[mesh_density_level][i].interval;
+ }
+ }
}
if (speed >= 4) {
@@ -325,7 +359,6 @@ static void set_rt_speed_feature_framesize_independent(
sf->adaptive_rd_thresh = 1;
sf->adaptive_rd_thresh_row_mt = 0;
sf->use_fast_coef_costing = 1;
- sf->allow_exhaustive_searches = 0;
sf->exhaustive_searches_thresh = INT_MAX;
sf->allow_acl = 0;
sf->copy_partition_flag = 0;
@@ -498,7 +531,15 @@ static void set_rt_speed_feature_framesize_independent(
// Enable short circuit for low temporal variance.
sf->short_circuit_low_temp_var = 1;
}
- if (cpi->use_svc) sf->base_mv_aggressive = 1;
+ if (cpi->svc.temporal_layer_id > 0) {
+ sf->adaptive_rd_thresh = 4;
+ sf->limit_newmv_early_exit = 0;
+ sf->mv.subpel_force_stop = (cpi->svc.temporal_layer_id == 1) ? 1 : 2;
+ sf->base_mv_aggressive =
+ (cpi->svc.temporal_layer_id == cpi->svc.number_temporal_layers - 1)
+ ? 1
+ : 0;
+ }
}
if (speed >= 7) {
@@ -523,9 +564,11 @@ static void set_rt_speed_feature_framesize_independent(
if (speed >= 8) {
sf->adaptive_rd_thresh = 4;
- // Enable partition copy
- if (!cpi->use_svc && !cpi->resize_pending && cpi->resize_state == ORIG &&
- !cpi->external_resize && cpi->oxcf.resize_mode == RESIZE_NONE) {
+ // Enable partition copy. For SVC, enabled only for the top resolution layer.
+ if (!cpi->last_frame_dropped && cpi->resize_state == ORIG &&
+ !cpi->external_resize &&
+ (!cpi->use_svc ||
+ cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)) {
sf->copy_partition_flag = 1;
cpi->max_copied_frame = 4;
}
@@ -533,7 +576,11 @@ static void set_rt_speed_feature_framesize_independent(
if (cpi->row_mt && cpi->oxcf.max_threads > 1)
sf->adaptive_rd_thresh_row_mt = 1;
- sf->mv.subpel_force_stop = (content == VP9E_CONTENT_SCREEN) ? 3 : 2;
+ if (content == VP9E_CONTENT_SCREEN)
+ sf->mv.subpel_force_stop = 3;
+ else if (cm->width * cm->height > 352 * 288)
+ sf->mv.subpel_force_stop = 2;
+
if (content == VP9E_CONTENT_SCREEN) sf->lpf_pick = LPF_PICK_MINIMAL_LPF;
// Only keep INTRA_DC mode for speed 8.
if (!is_keyframe) {
@@ -555,18 +602,13 @@ static void set_rt_speed_feature_framesize_independent(
}
// Since the short_circuit_low_temp_var is used, reduce the
// adaptive_rd_thresh level.
- if (cm->width > 320 && cm->height > 240)
+ if (cm->width * cm->height > 352 * 288)
sf->adaptive_rd_thresh = 1;
else
sf->adaptive_rd_thresh = 2;
}
sf->limit_newmv_early_exit = 0;
- if (cm->width > 320 && cm->height > 240) sf->use_simple_block_yrd = 1;
- }
- // Turn off adaptive_rd_thresh if row_mt is on for speed 5, 6, 7.
- if (speed >= 5 && speed < 8 && cpi->row_mt && cpi->num_workers > 1) {
- sf->adaptive_rd_thresh = 0;
- sf->adaptive_rd_thresh_row_mt = 0;
+ sf->use_simple_block_yrd = 1;
}
}
@@ -606,12 +648,11 @@ void vp9_set_speed_features_framesize_dependent(VP9_COMP *cpi) {
// With row based multi-threading, the following speed features
// have to be disabled to guarantee that bitstreams encoded with single thread
- // and multiple threads match
- if (cpi->oxcf.row_mt_bit_exact) {
+ // and multiple threads match.
+ // adaptive_rd_thresh can still be used in realtime when
+ // adaptive_rd_thresh_row_mt is enabled, since it is then defined per row for
+ // the non-rd pickmode.
+ if (!sf->adaptive_rd_thresh_row_mt && cpi->row_mt_bit_exact)
sf->adaptive_rd_thresh = 0;
- sf->allow_exhaustive_searches = 0;
- sf->adaptive_pred_interp_filter = 0;
- }
// This is only used in motion vector unit test.
if (cpi->oxcf.motion_vector_unit_test == 1)
@@ -711,6 +752,16 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) {
sf->adaptive_rd_thresh = 1;
sf->tx_size_search_breakout = 1;
+ sf->exhaustive_searches_thresh =
+ (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ? (1 << 20)
+ : INT_MAX;
+ if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) {
+ for (i = 0; i < MAX_MESH_STEP; ++i) {
+ sf->mesh_patterns[i].range = best_quality_mesh_pattern[i].range;
+ sf->mesh_patterns[i].interval = best_quality_mesh_pattern[i].interval;
+ }
+ }
+
if (oxcf->mode == REALTIME)
set_rt_speed_feature_framesize_independent(cpi, sf, oxcf->speed,
oxcf->content);
@@ -720,34 +771,6 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) {
cpi->full_search_sad = vp9_full_search_sad;
cpi->diamond_search_sad = vp9_diamond_search_sad;
- sf->allow_exhaustive_searches = 1;
- if (oxcf->mode == BEST) {
- if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION)
- sf->exhaustive_searches_thresh = (1 << 20);
- else
- sf->exhaustive_searches_thresh = (1 << 21);
- sf->max_exaustive_pct = 100;
- for (i = 0; i < MAX_MESH_STEP; ++i) {
- sf->mesh_patterns[i].range = best_quality_mesh_pattern[i].range;
- sf->mesh_patterns[i].interval = best_quality_mesh_pattern[i].interval;
- }
- } else {
- int speed = (oxcf->speed > MAX_MESH_SPEED) ? MAX_MESH_SPEED : oxcf->speed;
- if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION)
- sf->exhaustive_searches_thresh = (1 << 22);
- else
- sf->exhaustive_searches_thresh = (1 << 23);
- sf->max_exaustive_pct = good_quality_max_mesh_pct[speed];
- if (speed > 0)
- sf->exhaustive_searches_thresh = sf->exhaustive_searches_thresh << 1;
-
- for (i = 0; i < MAX_MESH_STEP; ++i) {
- sf->mesh_patterns[i].range = good_quality_mesh_patterns[speed][i].range;
- sf->mesh_patterns[i].interval =
- good_quality_mesh_patterns[speed][i].interval;
- }
- }
-
// Slow quant, dct and trellis not worthwhile for first pass
// so make sure they are always turned off.
if (oxcf->pass == 1) sf->optimize_coefficients = 0;
@@ -782,12 +805,11 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) {
// With row based multi-threading, the following speed features
// have to be disabled to guarantee that bitstreams encoded with single thread
- // and multiple threads match
- if (cpi->oxcf.row_mt_bit_exact) {
+ // and multiple threads match.
+ // adaptive_rd_thresh can still be used in realtime when
+ // adaptive_rd_thresh_row_mt is enabled, since it is then defined per row for
+ // the non-rd pickmode.
+ if (!sf->adaptive_rd_thresh_row_mt && cpi->row_mt_bit_exact)
sf->adaptive_rd_thresh = 0;
- sf->allow_exhaustive_searches = 0;
- sf->adaptive_pred_interp_filter = 0;
- }
// This is only used in motion vector unit test.
if (cpi->oxcf.motion_vector_unit_test == 1)
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.h
index cbdf8bc3090..ee485a35f4d 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.h
@@ -231,9 +231,11 @@ typedef struct SPEED_FEATURES {
// This variable is used to cap the maximum number of times we skip testing a
// mode to be evaluated. A high value means we will be faster.
+ // Turned off when (row_mt_bit_exact == 1 && adaptive_rd_thresh_row_mt == 0).
int adaptive_rd_thresh;
- // Flag to use adaptive_rd_thresh when row-mt it enabled.
+ // Flag to use adaptive_rd_thresh when row-mt is enabled, only for the non-rd
+ // pickmode.
int adaptive_rd_thresh_row_mt;
// Enables skipping the reconstruction step (idct, recon) in the
@@ -325,15 +327,9 @@ typedef struct SPEED_FEATURES {
// point for this motion search and limits the search range around it.
int adaptive_motion_search;
- // Flag for allowing some use of exhaustive searches;
- int allow_exhaustive_searches;
-
// Threshold for allowing exhaustive motion search.
int exhaustive_searches_thresh;
- // Maximum number of exhaustive searches for a frame.
- int max_exaustive_pct;
-
// Pattern to be used for any exhaustive mesh searches.
MESH_PATTERN mesh_patterns[MAX_MESH_STEP];
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_svc_layercontext.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_svc_layercontext.c
index 1d892dc148b..5867a6c38b8 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_svc_layercontext.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_svc_layercontext.c
@@ -38,10 +38,12 @@ void vp9_init_layer_context(VP9_COMP *const cpi) {
svc->current_superframe = 0;
for (i = 0; i < REF_FRAMES; ++i) svc->ref_frame_index[i] = -1;
for (sl = 0; sl < oxcf->ss_number_layers; ++sl) {
- cpi->svc.ext_frame_flags[sl] = 0;
- cpi->svc.ext_lst_fb_idx[sl] = 0;
- cpi->svc.ext_gld_fb_idx[sl] = 1;
- cpi->svc.ext_alt_fb_idx[sl] = 2;
+ svc->ext_frame_flags[sl] = 0;
+ svc->ext_lst_fb_idx[sl] = 0;
+ svc->ext_gld_fb_idx[sl] = 1;
+ svc->ext_alt_fb_idx[sl] = 2;
+ svc->downsample_filter_type[sl] = EIGHTTAP;
+ svc->downsample_filter_phase[sl] = 0; // Set to 8 for averaging filter.
}
if (cpi->oxcf.error_resilient_mode == 0 && cpi->oxcf.pass == 2) {
@@ -650,15 +652,25 @@ int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) {
lc->scaling_factor_num, lc->scaling_factor_den, &width,
&height);
+ // For low resolutions, set the filter phase to 8 (for a symmetric averaging
+ // filter) and use the bilinear filter for now.
+ if (width <= 320 && height <= 240) {
+ cpi->svc.downsample_filter_type[cpi->svc.spatial_layer_id] = BILINEAR;
+ cpi->svc.downsample_filter_phase[cpi->svc.spatial_layer_id] = 8;
+ }
+
// The usage of use_base_mv assumes down-scale of 2x2. For now, turn off use
- // of base motion vectors if spatial scale factors for any layers are not 2.
+ // of base motion vectors if spatial scale factors for any layers are not 2,
+ // except the 3-spatial-layer case with a 4x4 base-layer scale factor.
// TODO(marpan): Fix this to allow for use_base_mv for scale factors != 2.
if (cpi->svc.number_spatial_layers > 1) {
int sl;
for (sl = 0; sl < cpi->svc.number_spatial_layers - 1; ++sl) {
lc = &cpi->svc.layer_context[sl * cpi->svc.number_temporal_layers +
cpi->svc.temporal_layer_id];
- if (lc->scaling_factor_num != lc->scaling_factor_den >> 1) {
+ if ((lc->scaling_factor_num != lc->scaling_factor_den >> 1) &&
+ !(lc->scaling_factor_num == lc->scaling_factor_den >> 2 && sl == 0 &&
+ cpi->svc.number_spatial_layers == 3)) {
cpi->svc.use_base_mv = 0;
break;
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_svc_layercontext.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_svc_layercontext.h
index ee7a6638b42..d8e6772b26f 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_svc_layercontext.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_svc_layercontext.h
@@ -88,6 +88,13 @@ typedef struct {
int force_zero_mode_spatial_ref;
int current_superframe;
int use_base_mv;
+ // Used to control the downscaling filter for source scaling, for 1 pass CBR.
+ // downsample_filter_phase: = 0 will do sub-sampling (no weighted average),
+ // = 8 will center the target pixel and get a symmetric averaging filter.
+ // downsample_filter_type: 4 filters may be used: eighttap_regular,
+ // eighttap_smooth, eighttap_sharp, and bilinear.
+ INTERP_FILTER downsample_filter_type[VPX_SS_MAX_LAYERS];
+ int downsample_filter_phase[VPX_SS_MAX_LAYERS];
} SVC;
struct VP9_COMP;
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_temporal_filter.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_temporal_filter.c
index 2b0307f8a11..63079415617 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_temporal_filter.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_temporal_filter.c
@@ -8,10 +8,12 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <assert.h>
#include <math.h>
#include <limits.h>
#include "vp9/common/vp9_alloccommon.h"
+#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_onyxc_int.h"
#include "vp9/common/vp9_quant_common.h"
#include "vp9/common/vp9_reconinter.h"
@@ -53,16 +55,19 @@ static void temporal_filter_predictors_mb_c(
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- vp9_highbd_build_inter_predictor(y_mb_ptr, stride, &pred[0], 16, &mv, scale,
- 16, 16, which_mv, kernel, MV_PRECISION_Q3,
- x, y, xd->bd);
+ vp9_highbd_build_inter_predictor(CONVERT_TO_SHORTPTR(y_mb_ptr), stride,
+ CONVERT_TO_SHORTPTR(&pred[0]), 16, &mv,
+ scale, 16, 16, which_mv, kernel,
+ MV_PRECISION_Q3, x, y, xd->bd);
- vp9_highbd_build_inter_predictor(u_mb_ptr, uv_stride, &pred[256],
+ vp9_highbd_build_inter_predictor(CONVERT_TO_SHORTPTR(u_mb_ptr), uv_stride,
+ CONVERT_TO_SHORTPTR(&pred[256]),
uv_block_width, &mv, scale, uv_block_width,
uv_block_height, which_mv, kernel,
mv_precision_uv, x, y, xd->bd);
- vp9_highbd_build_inter_predictor(v_mb_ptr, uv_stride, &pred[512],
+ vp9_highbd_build_inter_predictor(CONVERT_TO_SHORTPTR(v_mb_ptr), uv_stride,
+ CONVERT_TO_SHORTPTR(&pred[512]),
uv_block_width, &mv, scale, uv_block_width,
uv_block_height, which_mv, kernel,
mv_precision_uv, x, y, xd->bd);
@@ -93,13 +98,19 @@ void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride,
const uint8_t *frame2,
unsigned int block_width,
unsigned int block_height, int strength,
- int filter_weight, unsigned int *accumulator,
+ int filter_weight, uint32_t *accumulator,
uint16_t *count) {
unsigned int i, j, k;
int modifier;
int byte = 0;
const int rounding = strength > 0 ? 1 << (strength - 1) : 0;
+ assert(strength >= 0);
+ assert(strength <= 6);
+
+ assert(filter_weight >= 0);
+ assert(filter_weight <= 2);
+
for (i = 0, k = 0; i < block_height; i++) {
for (j = 0; j < block_width; j++, k++) {
int pixel_value = *frame2;
@@ -155,7 +166,7 @@ void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride,
void vp9_highbd_temporal_filter_apply_c(
const uint8_t *frame1_8, unsigned int stride, const uint8_t *frame2_8,
unsigned int block_width, unsigned int block_height, int strength,
- int filter_weight, unsigned int *accumulator, uint16_t *count) {
+ int filter_weight, uint32_t *accumulator, uint16_t *count) {
const uint16_t *frame1 = CONVERT_TO_SHORTPTR(frame1_8);
const uint16_t *frame2 = CONVERT_TO_SHORTPTR(frame2_8);
unsigned int i, j, k;
@@ -285,7 +296,7 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td,
unsigned int filter_weight;
int mb_cols = (frames[alt_ref_index]->y_crop_width + 15) >> 4;
int mb_rows = (frames[alt_ref_index]->y_crop_height + 15) >> 4;
- DECLARE_ALIGNED(16, unsigned int, accumulator[16 * 16 * 3]);
+ DECLARE_ALIGNED(16, uint32_t, accumulator[16 * 16 * 3]);
DECLARE_ALIGNED(16, uint16_t, count[16 * 16 * 3]);
MACROBLOCKD *mbd = &td->mb.e_mbd;
YV12_BUFFER_CONFIG *f = frames[alt_ref_index];
@@ -332,8 +343,8 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td,
int stride;
MV ref_mv;
- memset(accumulator, 0, 16 * 16 * 3 * sizeof(accumulator[0]));
- memset(count, 0, 16 * 16 * 3 * sizeof(count[0]));
+ vp9_zero_array(accumulator, 16 * 16 * 3);
+ vp9_zero_array(count, 16 * 16 * 3);
td->mb.mv_limits.col_min = -((mb_col * 16) + (17 - 2 * VP9_INTERP_EXTEND));
td->mb.mv_limits.col_max =
@@ -376,45 +387,44 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td,
if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
int adj_strength = strength + 2 * (mbd->bd - 8);
// Apply the filter (YUV)
- vp9_highbd_temporal_filter_apply_c(
+ vp9_highbd_temporal_filter_apply(
f->y_buffer + mb_y_offset, f->y_stride, predictor, 16, 16,
adj_strength, filter_weight, accumulator, count);
- vp9_highbd_temporal_filter_apply_c(
+ vp9_highbd_temporal_filter_apply(
f->u_buffer + mb_uv_offset, f->uv_stride, predictor + 256,
mb_uv_width, mb_uv_height, adj_strength, filter_weight,
accumulator + 256, count + 256);
- vp9_highbd_temporal_filter_apply_c(
+ vp9_highbd_temporal_filter_apply(
f->v_buffer + mb_uv_offset, f->uv_stride, predictor + 512,
mb_uv_width, mb_uv_height, adj_strength, filter_weight,
accumulator + 512, count + 512);
} else {
// Apply the filter (YUV)
- vp9_temporal_filter_apply_c(f->y_buffer + mb_y_offset, f->y_stride,
- predictor, 16, 16, strength,
- filter_weight, accumulator, count);
- vp9_temporal_filter_apply_c(f->u_buffer + mb_uv_offset, f->uv_stride,
- predictor + 256, mb_uv_width,
- mb_uv_height, strength, filter_weight,
- accumulator + 256, count + 256);
- vp9_temporal_filter_apply_c(f->v_buffer + mb_uv_offset, f->uv_stride,
- predictor + 512, mb_uv_width,
- mb_uv_height, strength, filter_weight,
- accumulator + 512, count + 512);
- }
-#else
- // Apply the filter (YUV)
- // TODO(jingning): Need SIMD optimization for this.
- vp9_temporal_filter_apply_c(f->y_buffer + mb_y_offset, f->y_stride,
+ vp9_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride,
predictor, 16, 16, strength, filter_weight,
accumulator, count);
- vp9_temporal_filter_apply_c(f->u_buffer + mb_uv_offset, f->uv_stride,
+ vp9_temporal_filter_apply(f->u_buffer + mb_uv_offset, f->uv_stride,
predictor + 256, mb_uv_width, mb_uv_height,
strength, filter_weight, accumulator + 256,
count + 256);
- vp9_temporal_filter_apply_c(f->v_buffer + mb_uv_offset, f->uv_stride,
+ vp9_temporal_filter_apply(f->v_buffer + mb_uv_offset, f->uv_stride,
predictor + 512, mb_uv_width, mb_uv_height,
strength, filter_weight, accumulator + 512,
count + 512);
+ }
+#else
+ // Apply the filter (YUV)
+ vp9_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride,
+ predictor, 16, 16, strength, filter_weight,
+ accumulator, count);
+ vp9_temporal_filter_apply(f->u_buffer + mb_uv_offset, f->uv_stride,
+ predictor + 256, mb_uv_width, mb_uv_height,
+ strength, filter_weight, accumulator + 256,
+ count + 256);
+ vp9_temporal_filter_apply(f->v_buffer + mb_uv_offset, f->uv_stride,
+ predictor + 512, mb_uv_width, mb_uv_height,
+ strength, filter_weight, accumulator + 512,
+ count + 512);
#endif // CONFIG_VP9_HIGHBITDEPTH
}
}
@@ -745,7 +755,8 @@ void vp9_temporal_filter(VP9_COMP *cpi, int distance) {
"Failed to reallocate alt_ref_buffer");
}
frames[frame] = vp9_scale_if_required(
- cm, frames[frame], &cpi->svc.scaled_frames[frame_used], 0);
+ cm, frames[frame], &cpi->svc.scaled_frames[frame_used], 0,
+ EIGHTTAP, 0);
++frame_used;
}
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/temporal_filter_sse4.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/temporal_filter_sse4.c
new file mode 100644
index 00000000000..be4cd8685c5
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/temporal_filter_sse4.c
@@ -0,0 +1,375 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <smmintrin.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+// Division using multiplication and shifting. The C implementation does:
+// modifier *= 3;
+// modifier /= index;
+// where 'modifier' is a set of summed values and 'index' is the number of
+// summed values. 'index' may be 4, 6, or 9, representing a block of 9 values
+// which may be bound by the edges of the block being filtered.
+//
+// This equation works out to (m * 3) / i which reduces to:
+// m * 3/4
+// m * 1/2
+// m * 1/3
+//
+// By pairing the multiply with a down shift by 16 (_mm_mulhi_epu16):
+// m * C / 65536
+// we can choose a constant C that replicates the division.
+//
+// m * 49152 / 65536 = m * 3/4
+// m * 32758 / 65536 = m * 1/2
+// m * 21846 / 65536 = m * 0.3333
+//
+// These are loaded using an instruction expecting int16_t values but are used
+// with _mm_mulhi_epu16(), which treats them as unsigned.
+#define NEIGHBOR_CONSTANT_4 (int16_t)49152
+#define NEIGHBOR_CONSTANT_6 (int16_t)32768
+#define NEIGHBOR_CONSTANT_9 (int16_t)21846
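
Each constant is (3/index) * 65536 rounded to the nearest integer, so (m * C) >> 16 matches (m * 3) / index to within one unit of rounding. A scalar spot check of the arithmetic the comment describes:

#include <stdio.h>

int main(void) {
  const unsigned m = 1000; /* an example summed 'modifier' value */
  printf("index 4: %u vs %u\n", (m * 49152u) >> 16, m * 3u / 4); /* 750, 750 */
  printf("index 6: %u vs %u\n", (m * 32768u) >> 16, m * 3u / 6); /* 500, 500 */
  printf("index 9: %u vs %u\n", (m * 21846u) >> 16, m * 3u / 9); /* 333, 333 */
  return 0;
}
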
+
+// Load values from 'a' and 'b'. Compute the difference squared and sum
+// neighboring values such that:
+// sum[1] = (a[0]-b[0])^2 + (a[1]-b[1])^2 + (a[2]-b[2])^2
+// Values to the left and right of the row are set to 0.
+// The sums are returned as *unsigned* 16 bit values: in 'sum' here, and in
+// sum_0/sum_1 for sum_16() below.
+static void sum_8(const uint8_t *a, const uint8_t *b, __m128i *sum) {
+ const __m128i a_u8 = _mm_loadl_epi64((const __m128i *)a);
+ const __m128i b_u8 = _mm_loadl_epi64((const __m128i *)b);
+
+ const __m128i a_u16 = _mm_cvtepu8_epi16(a_u8);
+ const __m128i b_u16 = _mm_cvtepu8_epi16(b_u8);
+
+ const __m128i diff_s16 = _mm_sub_epi16(a_u16, b_u16);
+ const __m128i diff_sq_u16 = _mm_mullo_epi16(diff_s16, diff_s16);
+
+ // Shift all the values one place to the left/right so we can efficiently sum
+ // diff_sq_u16[i - 1] + diff_sq_u16[i] + diff_sq_u16[i + 1].
+ const __m128i shift_left = _mm_slli_si128(diff_sq_u16, 2);
+ const __m128i shift_right = _mm_srli_si128(diff_sq_u16, 2);
+
+ // It becomes necessary to treat the values as unsigned at this point. The
+ // 255^2 fits in uint16_t but not int16_t. Use saturating adds from this point
+ // forward since the filter is only applied to smooth small pixel changes.
+ // Once the value has saturated to uint16_t it is well outside the useful
+ // range.
+ __m128i sum_u16 = _mm_adds_epu16(diff_sq_u16, shift_left);
+ sum_u16 = _mm_adds_epu16(sum_u16, shift_right);
+
+ *sum = sum_u16;
+}
+
+static void sum_16(const uint8_t *a, const uint8_t *b, __m128i *sum_0,
+ __m128i *sum_1) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i a_u8 = _mm_loadu_si128((const __m128i *)a);
+ const __m128i b_u8 = _mm_loadu_si128((const __m128i *)b);
+
+ const __m128i a_0_u16 = _mm_cvtepu8_epi16(a_u8);
+ const __m128i a_1_u16 = _mm_unpackhi_epi8(a_u8, zero);
+ const __m128i b_0_u16 = _mm_cvtepu8_epi16(b_u8);
+ const __m128i b_1_u16 = _mm_unpackhi_epi8(b_u8, zero);
+
+ const __m128i diff_0_s16 = _mm_sub_epi16(a_0_u16, b_0_u16);
+ const __m128i diff_1_s16 = _mm_sub_epi16(a_1_u16, b_1_u16);
+ const __m128i diff_sq_0_u16 = _mm_mullo_epi16(diff_0_s16, diff_0_s16);
+ const __m128i diff_sq_1_u16 = _mm_mullo_epi16(diff_1_s16, diff_1_s16);
+
+ __m128i shift_left = _mm_slli_si128(diff_sq_0_u16, 2);
+ // Use _mm_alignr_epi8() to "shift in" diff_sq_u16[8].
+ __m128i shift_right = _mm_alignr_epi8(diff_sq_1_u16, diff_sq_0_u16, 2);
+
+ __m128i sum_u16 = _mm_adds_epu16(diff_sq_0_u16, shift_left);
+ sum_u16 = _mm_adds_epu16(sum_u16, shift_right);
+
+ *sum_0 = sum_u16;
+
+ shift_left = _mm_alignr_epi8(diff_sq_1_u16, diff_sq_0_u16, 14);
+ shift_right = _mm_srli_si128(diff_sq_1_u16, 2);
+
+ sum_u16 = _mm_adds_epu16(diff_sq_1_u16, shift_left);
+ sum_u16 = _mm_adds_epu16(sum_u16, shift_right);
+
+ *sum_1 = sum_u16;
+}
+
+// Average the value based on the number of values summed (9 for pixels away
+// from the border, 4 for pixels in corners, and 6 for other edge values).
+//
+// Add in the rounding factor, shift right by 'strength', clamp to 16, then
+// subtract from 16 and multiply by 'weight'.
+static __m128i average_8(__m128i sum, const __m128i mul_constants,
+ const int strength, const int rounding,
+ const int weight) {
+ // _mm_srl_epi16 uses the lower 64 bit value for the shift.
+ const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength);
+ const __m128i rounding_u16 = _mm_set1_epi16(rounding);
+ const __m128i weight_u16 = _mm_set1_epi16(weight);
+ const __m128i sixteen = _mm_set1_epi16(16);
+
+ // modifier * 3 / index;
+ sum = _mm_mulhi_epu16(sum, mul_constants);
+
+ sum = _mm_adds_epu16(sum, rounding_u16);
+ sum = _mm_srl_epi16(sum, strength_u128);
+
+  // The maximum input to this comparison is UINT16_MAX * NEIGHBOR_CONSTANT_4
+  // >> 16, i.e. NEIGHBOR_CONSTANT_4 - 1 = 49151 (0xbfff, or -16385 when read
+  // as a signed 16 bit value). So this must use the epu16 version, which was
+  // not available until SSE4.1.
+ sum = _mm_min_epu16(sum, sixteen);
+
+ sum = _mm_sub_epi16(sixteen, sum);
+
+ return _mm_mullo_epi16(sum, weight_u16);
+}
+
+static void average_16(__m128i *sum_0_u16, __m128i *sum_1_u16,
+ const __m128i mul_constants_0,
+ const __m128i mul_constants_1, const int strength,
+ const int rounding, const int weight) {
+ const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength);
+ const __m128i rounding_u16 = _mm_set1_epi16(rounding);
+ const __m128i weight_u16 = _mm_set1_epi16(weight);
+ const __m128i sixteen = _mm_set1_epi16(16);
+ __m128i input_0, input_1;
+
+ input_0 = _mm_mulhi_epu16(*sum_0_u16, mul_constants_0);
+ input_0 = _mm_adds_epu16(input_0, rounding_u16);
+
+ input_1 = _mm_mulhi_epu16(*sum_1_u16, mul_constants_1);
+ input_1 = _mm_adds_epu16(input_1, rounding_u16);
+
+ input_0 = _mm_srl_epi16(input_0, strength_u128);
+ input_1 = _mm_srl_epi16(input_1, strength_u128);
+
+ input_0 = _mm_min_epu16(input_0, sixteen);
+ input_1 = _mm_min_epu16(input_1, sixteen);
+ input_0 = _mm_sub_epi16(sixteen, input_0);
+ input_1 = _mm_sub_epi16(sixteen, input_1);
+
+ *sum_0_u16 = _mm_mullo_epi16(input_0, weight_u16);
+ *sum_1_u16 = _mm_mullo_epi16(input_1, weight_u16);
+}
+
+// Add 'sum_u16' to 'count'. Multiply by 'pred' and add to 'accumulator.'
+static void accumulate_and_store_8(const __m128i sum_u16, const uint8_t *pred,
+ uint16_t *count, uint32_t *accumulator) {
+ const __m128i pred_u8 = _mm_loadl_epi64((const __m128i *)pred);
+ const __m128i zero = _mm_setzero_si128();
+ __m128i count_u16 = _mm_loadu_si128((const __m128i *)count);
+ __m128i pred_u16 = _mm_cvtepu8_epi16(pred_u8);
+ __m128i pred_0_u32, pred_1_u32;
+ __m128i accum_0_u32, accum_1_u32;
+
+ count_u16 = _mm_adds_epu16(count_u16, sum_u16);
+ _mm_storeu_si128((__m128i *)count, count_u16);
+
+ pred_u16 = _mm_mullo_epi16(sum_u16, pred_u16);
+
+ pred_0_u32 = _mm_cvtepu16_epi32(pred_u16);
+ pred_1_u32 = _mm_unpackhi_epi16(pred_u16, zero);
+
+ accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator);
+ accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4));
+
+ accum_0_u32 = _mm_add_epi32(pred_0_u32, accum_0_u32);
+ accum_1_u32 = _mm_add_epi32(pred_1_u32, accum_1_u32);
+
+ _mm_storeu_si128((__m128i *)accumulator, accum_0_u32);
+ _mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32);
+}
+
+static void accumulate_and_store_16(const __m128i sum_0_u16,
+ const __m128i sum_1_u16,
+ const uint8_t *pred, uint16_t *count,
+ uint32_t *accumulator) {
+ const __m128i pred_u8 = _mm_loadu_si128((const __m128i *)pred);
+ const __m128i zero = _mm_setzero_si128();
+ __m128i count_0_u16 = _mm_loadu_si128((const __m128i *)count),
+ count_1_u16 = _mm_loadu_si128((const __m128i *)(count + 8));
+ __m128i pred_0_u16 = _mm_cvtepu8_epi16(pred_u8),
+ pred_1_u16 = _mm_unpackhi_epi8(pred_u8, zero);
+ __m128i pred_0_u32, pred_1_u32, pred_2_u32, pred_3_u32;
+ __m128i accum_0_u32, accum_1_u32, accum_2_u32, accum_3_u32;
+
+ count_0_u16 = _mm_adds_epu16(count_0_u16, sum_0_u16);
+ _mm_storeu_si128((__m128i *)count, count_0_u16);
+
+ count_1_u16 = _mm_adds_epu16(count_1_u16, sum_1_u16);
+ _mm_storeu_si128((__m128i *)(count + 8), count_1_u16);
+
+ pred_0_u16 = _mm_mullo_epi16(sum_0_u16, pred_0_u16);
+ pred_1_u16 = _mm_mullo_epi16(sum_1_u16, pred_1_u16);
+
+ pred_0_u32 = _mm_cvtepu16_epi32(pred_0_u16);
+ pred_1_u32 = _mm_unpackhi_epi16(pred_0_u16, zero);
+ pred_2_u32 = _mm_cvtepu16_epi32(pred_1_u16);
+ pred_3_u32 = _mm_unpackhi_epi16(pred_1_u16, zero);
+
+ accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator);
+ accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4));
+ accum_2_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 8));
+ accum_3_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 12));
+
+ accum_0_u32 = _mm_add_epi32(pred_0_u32, accum_0_u32);
+ accum_1_u32 = _mm_add_epi32(pred_1_u32, accum_1_u32);
+ accum_2_u32 = _mm_add_epi32(pred_2_u32, accum_2_u32);
+ accum_3_u32 = _mm_add_epi32(pred_3_u32, accum_3_u32);
+
+ _mm_storeu_si128((__m128i *)accumulator, accum_0_u32);
+ _mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32);
+ _mm_storeu_si128((__m128i *)(accumulator + 8), accum_2_u32);
+ _mm_storeu_si128((__m128i *)(accumulator + 12), accum_3_u32);
+}
+
+void vp9_temporal_filter_apply_sse4_1(const uint8_t *a, unsigned int stride,
+ const uint8_t *b, unsigned int width,
+ unsigned int height, int strength,
+ int weight, uint32_t *accumulator,
+ uint16_t *count) {
+ unsigned int h;
+ const int rounding = strength > 0 ? 1 << (strength - 1) : 0;
+
+ assert(strength >= 0);
+ assert(strength <= 6);
+
+ assert(weight >= 0);
+ assert(weight <= 2);
+
+ assert(width == 8 || width == 16);
+
+ if (width == 8) {
+ __m128i sum_row_a, sum_row_b, sum_row_c;
+ __m128i mul_constants = _mm_setr_epi16(
+ NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
+ NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
+ NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4);
+
+ sum_8(a, b, &sum_row_a);
+ sum_8(a + stride, b + width, &sum_row_b);
+ sum_row_c = _mm_adds_epu16(sum_row_a, sum_row_b);
+ sum_row_c = average_8(sum_row_c, mul_constants, strength, rounding, weight);
+ accumulate_and_store_8(sum_row_c, b, count, accumulator);
+
+ a += stride + stride;
+ b += width;
+ count += width;
+ accumulator += width;
+
+ mul_constants = _mm_setr_epi16(NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_9,
+ NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
+ NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
+ NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_6);
+
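+    // Slide a three-row window down the block: sum_row_a and sum_row_b hold
+    // the two previous rows of summed squared differences, and sum_row_c
+    // brings in the next row, so each output row is filtered together with
+    // its vertical neighbors.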
+ for (h = 0; h < height - 2; ++h) {
+ sum_8(a, b + width, &sum_row_c);
+ sum_row_a = _mm_adds_epu16(sum_row_a, sum_row_b);
+ sum_row_a = _mm_adds_epu16(sum_row_a, sum_row_c);
+ sum_row_a =
+ average_8(sum_row_a, mul_constants, strength, rounding, weight);
+ accumulate_and_store_8(sum_row_a, b, count, accumulator);
+
+ a += stride;
+ b += width;
+ count += width;
+ accumulator += width;
+
+ sum_row_a = sum_row_b;
+ sum_row_b = sum_row_c;
+ }
+
+ mul_constants = _mm_setr_epi16(NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6,
+ NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
+ NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
+ NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4);
+ sum_row_a = _mm_adds_epu16(sum_row_a, sum_row_b);
+ sum_row_a = average_8(sum_row_a, mul_constants, strength, rounding, weight);
+ accumulate_and_store_8(sum_row_a, b, count, accumulator);
+
+ } else { // width == 16
+ __m128i sum_row_a_0, sum_row_a_1;
+ __m128i sum_row_b_0, sum_row_b_1;
+ __m128i sum_row_c_0, sum_row_c_1;
+ __m128i mul_constants_0 = _mm_setr_epi16(
+ NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
+ NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
+ NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6),
+ mul_constants_1 = _mm_setr_epi16(
+ NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
+ NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
+ NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4);
+
+ sum_16(a, b, &sum_row_a_0, &sum_row_a_1);
+ sum_16(a + stride, b + width, &sum_row_b_0, &sum_row_b_1);
+
+ sum_row_c_0 = _mm_adds_epu16(sum_row_a_0, sum_row_b_0);
+ sum_row_c_1 = _mm_adds_epu16(sum_row_a_1, sum_row_b_1);
+
+ average_16(&sum_row_c_0, &sum_row_c_1, mul_constants_0, mul_constants_1,
+ strength, rounding, weight);
+ accumulate_and_store_16(sum_row_c_0, sum_row_c_1, b, count, accumulator);
+
+ a += stride + stride;
+ b += width;
+ count += width;
+ accumulator += width;
+
+ mul_constants_0 = _mm_setr_epi16(NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_9,
+ NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
+ NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
+ NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9);
+ mul_constants_1 = _mm_setr_epi16(NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
+ NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
+ NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
+ NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_6);
+ for (h = 0; h < height - 2; ++h) {
+ sum_16(a, b + width, &sum_row_c_0, &sum_row_c_1);
+
+ sum_row_a_0 = _mm_adds_epu16(sum_row_a_0, sum_row_b_0);
+ sum_row_a_0 = _mm_adds_epu16(sum_row_a_0, sum_row_c_0);
+ sum_row_a_1 = _mm_adds_epu16(sum_row_a_1, sum_row_b_1);
+ sum_row_a_1 = _mm_adds_epu16(sum_row_a_1, sum_row_c_1);
+
+ average_16(&sum_row_a_0, &sum_row_a_1, mul_constants_0, mul_constants_1,
+ strength, rounding, weight);
+ accumulate_and_store_16(sum_row_a_0, sum_row_a_1, b, count, accumulator);
+
+ a += stride;
+ b += width;
+ count += width;
+ accumulator += width;
+
+ sum_row_a_0 = sum_row_b_0;
+ sum_row_a_1 = sum_row_b_1;
+ sum_row_b_0 = sum_row_c_0;
+ sum_row_b_1 = sum_row_c_1;
+ }
+
+ mul_constants_0 = _mm_setr_epi16(NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6,
+ NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
+ NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
+ NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6);
+ mul_constants_1 = _mm_setr_epi16(NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
+ NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
+ NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
+ NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4);
+ sum_row_c_0 = _mm_adds_epu16(sum_row_a_0, sum_row_b_0);
+ sum_row_c_1 = _mm_adds_epu16(sum_row_a_1, sum_row_b_1);
+
+ average_16(&sum_row_c_0, &sum_row_c_1, mul_constants_0, mul_constants_1,
+ strength, rounding, weight);
+ accumulate_and_store_16(sum_row_c_0, sum_row_c_1, b, count, accumulator);
+ }
+}
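For reference, a minimal scalar sketch of the per-pixel step that vp9_temporal_filter_apply_sse4_1() vectorizes (the helper name is hypothetical, and the rounding mirrors the comments above rather than the exact SIMD fixed-point path):

    #include <stdint.h>

    /* Scalar model of one filtered pixel. 'diff_sq_sum' is the 3x3 sum of
     * squared differences and 'index' its population (4, 6, or 9). */
    static void filter_pixel(uint32_t diff_sq_sum, int index, int strength,
                             int weight, uint8_t pred, uint16_t *count,
                             uint32_t *accumulator) {
      uint32_t mod = diff_sq_sum * 3 / index;          /* neighbor average */
      mod += strength > 0 ? 1u << (strength - 1) : 0;  /* rounding factor */
      mod >>= strength;
      if (mod > 16) mod = 16;                          /* clamp to 16 */
      mod = (16 - mod) * weight;                       /* invert and weight */
      *count += (uint16_t)mod;
      *accumulator += mod * pred;
    }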
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_avx2.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_avx2.c
new file mode 100644
index 00000000000..e228bd8b7fa
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_avx2.c
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "./vp9_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/x86/bitdepth_conversion_avx2.h"
+
+int64_t vp9_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff,
+ intptr_t block_size, int64_t *ssz) {
+ __m256i sse_256, ssz_256;
+ __m256i exp_dqcoeff_lo, exp_dqcoeff_hi, exp_coeff_lo, exp_coeff_hi;
+ __m256i sse_hi, ssz_hi;
+ __m128i sse_128, ssz_128;
+ int64_t sse;
+ const __m256i zero = _mm256_setzero_si256();
+
+ // If the block size is 16 then the results will fit in 32 bits.
+ if (block_size == 16) {
+ __m256i coeff_256, dqcoeff_256, coeff_hi, dqcoeff_hi;
+ // Load 16 elements for coeff and dqcoeff.
+ coeff_256 = load_tran_low(coeff);
+ dqcoeff_256 = load_tran_low(dqcoeff);
+ // dqcoeff - coeff
+ dqcoeff_256 = _mm256_sub_epi16(dqcoeff_256, coeff_256);
+ // madd (dqcoeff - coeff)
+ dqcoeff_256 = _mm256_madd_epi16(dqcoeff_256, dqcoeff_256);
+ // madd coeff
+ coeff_256 = _mm256_madd_epi16(coeff_256, coeff_256);
+ // Save the higher 64 bit of each 128 bit lane.
+ dqcoeff_hi = _mm256_srli_si256(dqcoeff_256, 8);
+ coeff_hi = _mm256_srli_si256(coeff_256, 8);
+ // Add the higher 64 bit to the low 64 bit.
+ dqcoeff_256 = _mm256_add_epi32(dqcoeff_256, dqcoeff_hi);
+ coeff_256 = _mm256_add_epi32(coeff_256, coeff_hi);
+ // Expand each double word in the lower 64 bits to quad word.
+ sse_256 = _mm256_unpacklo_epi32(dqcoeff_256, zero);
+ ssz_256 = _mm256_unpacklo_epi32(coeff_256, zero);
+ } else {
+ int i;
+ assert(block_size % 32 == 0);
+ sse_256 = zero;
+ ssz_256 = zero;
+
+ for (i = 0; i < block_size; i += 32) {
+ __m256i coeff_0, coeff_1, dqcoeff_0, dqcoeff_1;
+ // Load 32 elements for coeff and dqcoeff.
+ coeff_0 = load_tran_low(coeff + i);
+ dqcoeff_0 = load_tran_low(dqcoeff + i);
+ coeff_1 = load_tran_low(coeff + i + 16);
+ dqcoeff_1 = load_tran_low(dqcoeff + i + 16);
+ // dqcoeff - coeff
+ dqcoeff_0 = _mm256_sub_epi16(dqcoeff_0, coeff_0);
+ dqcoeff_1 = _mm256_sub_epi16(dqcoeff_1, coeff_1);
+ // madd (dqcoeff - coeff)
+ dqcoeff_0 = _mm256_madd_epi16(dqcoeff_0, dqcoeff_0);
+ dqcoeff_1 = _mm256_madd_epi16(dqcoeff_1, dqcoeff_1);
+ // madd coeff
+ coeff_0 = _mm256_madd_epi16(coeff_0, coeff_0);
+ coeff_1 = _mm256_madd_epi16(coeff_1, coeff_1);
+ // Add the first madd (dqcoeff - coeff) with the second.
+ dqcoeff_0 = _mm256_add_epi32(dqcoeff_0, dqcoeff_1);
+ // Add the first madd (coeff) with the second.
+ coeff_0 = _mm256_add_epi32(coeff_0, coeff_1);
+ // Expand each double word of madd (dqcoeff - coeff) to quad word.
+ exp_dqcoeff_lo = _mm256_unpacklo_epi32(dqcoeff_0, zero);
+ exp_dqcoeff_hi = _mm256_unpackhi_epi32(dqcoeff_0, zero);
+ // expand each double word of madd (coeff) to quad word
+ exp_coeff_lo = _mm256_unpacklo_epi32(coeff_0, zero);
+ exp_coeff_hi = _mm256_unpackhi_epi32(coeff_0, zero);
+ // Add each quad word of madd (dqcoeff - coeff) and madd (coeff).
+ sse_256 = _mm256_add_epi64(sse_256, exp_dqcoeff_lo);
+ ssz_256 = _mm256_add_epi64(ssz_256, exp_coeff_lo);
+ sse_256 = _mm256_add_epi64(sse_256, exp_dqcoeff_hi);
+ ssz_256 = _mm256_add_epi64(ssz_256, exp_coeff_hi);
+ }
+ }
+ // Save the higher 64 bit of each 128 bit lane.
+ sse_hi = _mm256_srli_si256(sse_256, 8);
+ ssz_hi = _mm256_srli_si256(ssz_256, 8);
+ // Add the higher 64 bit to the low 64 bit.
+ sse_256 = _mm256_add_epi64(sse_256, sse_hi);
+ ssz_256 = _mm256_add_epi64(ssz_256, ssz_hi);
+
+ // Add each 64 bit from each of the 128 bit lane of the 256 bit.
+ sse_128 = _mm_add_epi64(_mm256_castsi256_si128(sse_256),
+ _mm256_extractf128_si256(sse_256, 1));
+
+ ssz_128 = _mm_add_epi64(_mm256_castsi256_si128(ssz_256),
+ _mm256_extractf128_si256(ssz_256, 1));
+
+ // Store the results.
+ _mm_storel_epi64((__m128i *)(&sse), sse_128);
+
+ _mm_storel_epi64((__m128i *)(ssz), ssz_128);
+ return sse;
+}
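For reference, a minimal scalar model of vp9_block_error_avx2() (the helper name is hypothetical; tran_low_t inputs are treated here as plain int16_t, as in the low-bitdepth build):

    #include <stdint.h>

    /* sse is the sum of squared differences between the dequantized and the
     * original coefficients; *ssz receives the sum of squared originals. */
    static int64_t block_error_model(const int16_t *coeff,
                                     const int16_t *dqcoeff,
                                     intptr_t block_size, int64_t *ssz) {
      int64_t sse = 0, sz = 0;
      intptr_t i;
      for (i = 0; i < block_size; ++i) {
        const int32_t diff = dqcoeff[i] - coeff[i];
        sse += (int64_t)diff * diff;
        sz += (int64_t)coeff[i] * coeff[i];
      }
      *ssz = sz;
      return sse;
    }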
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_intrin_avx2.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_intrin_avx2.c
deleted file mode 100644
index e39027f2536..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_intrin_avx2.c
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- * Usee of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <immintrin.h> // AVX2
-
-#include "./vp9_rtcd.h"
-#include "vpx/vpx_integer.h"
-#include "vpx_dsp/vpx_dsp_common.h"
-#include "vpx_dsp/x86/bitdepth_conversion_avx2.h"
-
-int64_t vp9_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff,
- intptr_t block_size, int64_t *ssz) {
- __m256i sse_reg, ssz_reg, coeff_reg, dqcoeff_reg;
- __m256i exp_dqcoeff_lo, exp_dqcoeff_hi, exp_coeff_lo, exp_coeff_hi;
- __m256i sse_reg_64hi, ssz_reg_64hi;
- __m128i sse_reg128, ssz_reg128;
- int64_t sse;
- int i;
- const __m256i zero_reg = _mm256_set1_epi16(0);
-
- // init sse and ssz registerd to zero
- sse_reg = _mm256_set1_epi16(0);
- ssz_reg = _mm256_set1_epi16(0);
-
- for (i = 0; i < block_size; i += 16) {
- // load 32 bytes from coeff and dqcoeff
- coeff_reg = load_tran_low(coeff + i);
- dqcoeff_reg = load_tran_low(dqcoeff + i);
- // dqcoeff - coeff
- dqcoeff_reg = _mm256_sub_epi16(dqcoeff_reg, coeff_reg);
- // madd (dqcoeff - coeff)
- dqcoeff_reg = _mm256_madd_epi16(dqcoeff_reg, dqcoeff_reg);
- // madd coeff
- coeff_reg = _mm256_madd_epi16(coeff_reg, coeff_reg);
- // expand each double word of madd (dqcoeff - coeff) to quad word
- exp_dqcoeff_lo = _mm256_unpacklo_epi32(dqcoeff_reg, zero_reg);
- exp_dqcoeff_hi = _mm256_unpackhi_epi32(dqcoeff_reg, zero_reg);
- // expand each double word of madd (coeff) to quad word
- exp_coeff_lo = _mm256_unpacklo_epi32(coeff_reg, zero_reg);
- exp_coeff_hi = _mm256_unpackhi_epi32(coeff_reg, zero_reg);
- // add each quad word of madd (dqcoeff - coeff) and madd (coeff)
- sse_reg = _mm256_add_epi64(sse_reg, exp_dqcoeff_lo);
- ssz_reg = _mm256_add_epi64(ssz_reg, exp_coeff_lo);
- sse_reg = _mm256_add_epi64(sse_reg, exp_dqcoeff_hi);
- ssz_reg = _mm256_add_epi64(ssz_reg, exp_coeff_hi);
- }
- // save the higher 64 bit of each 128 bit lane
- sse_reg_64hi = _mm256_srli_si256(sse_reg, 8);
- ssz_reg_64hi = _mm256_srli_si256(ssz_reg, 8);
- // add the higher 64 bit to the low 64 bit
- sse_reg = _mm256_add_epi64(sse_reg, sse_reg_64hi);
- ssz_reg = _mm256_add_epi64(ssz_reg, ssz_reg_64hi);
-
- // add each 64 bit from each of the 128 bit lane of the 256 bit
- sse_reg128 = _mm_add_epi64(_mm256_castsi256_si128(sse_reg),
- _mm256_extractf128_si256(sse_reg, 1));
-
- ssz_reg128 = _mm_add_epi64(_mm256_castsi256_si128(ssz_reg),
- _mm256_extractf128_si256(ssz_reg, 1));
-
- // store the results
- _mm_storel_epi64((__m128i *)(&sse), sse_reg128);
-
- _mm_storel_epi64((__m128i *)(ssz), ssz_reg128);
- return sse;
-}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_sse2.asm b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_sse2.asm
index 0a472ec7402..11d473b2dfa 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_sse2.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_sse2.asm
@@ -39,23 +39,18 @@ cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz
pmaddwd m1, m1
pmaddwd m2, m2
pmaddwd m3, m3
+ ; the sum of 2 31bit integers will fit in a 32bit unsigned integer
+ paddd m0, m1
+ paddd m2, m3
; accumulate in 64bit
punpckldq m7, m0, m5
punpckhdq m0, m5
paddq m4, m7
- punpckldq m7, m1, m5
- paddq m4, m0
- punpckhdq m1, m5
- paddq m4, m7
punpckldq m7, m2, m5
- paddq m4, m1
+ paddq m4, m0
punpckhdq m2, m5
paddq m6, m7
- punpckldq m7, m3, m5
paddq m6, m2
- punpckhdq m3, m5
- paddq m6, m7
- paddq m6, m3
jg .loop
; accumulate horizontally and store in return value
@@ -98,15 +93,13 @@ cglobal block_error_fp, 3, 3, 6, uqc, dqc, size
; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
pmaddwd m0, m0
pmaddwd m1, m1
+ ; the sum of 2 31bit integers will fit in a 32bit unsigned integer
+ paddd m0, m1
; accumulate in 64bit
punpckldq m3, m0, m5
punpckhdq m0, m5
paddq m4, m3
- punpckldq m3, m1, m5
paddq m4, m0
- punpckhdq m1, m5
- paddq m4, m3
- paddq m4, m1
jnz .loop
; accumulate horizontally and store in return value
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c
index fa2a6449b02..b53714a0289 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c
@@ -16,7 +16,8 @@
#include "vpx_scale/yv12config.h"
extern void vp9_scale_and_extend_frame_c(const YV12_BUFFER_CONFIG *src,
- YV12_BUFFER_CONFIG *dst);
+ YV12_BUFFER_CONFIG *dst,
+ uint8_t filter_type, int phase_scaler);
static void downsample_2_to_1_ssse3(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride, int w,
@@ -168,7 +169,8 @@ static void upsample_1_to_2_ssse3(const uint8_t *src, ptrdiff_t src_stride,
}
void vp9_scale_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src,
- YV12_BUFFER_CONFIG *dst) {
+ YV12_BUFFER_CONFIG *dst,
+ uint8_t filter_type, int phase_scaler) {
const int src_w = src->y_crop_width;
const int src_h = src->y_crop_height;
const int dst_w = dst->y_crop_width;
@@ -176,7 +178,7 @@ void vp9_scale_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src,
const int dst_uv_w = dst_w / 2;
const int dst_uv_h = dst_h / 2;
- if (dst_w * 2 == src_w && dst_h * 2 == src_h) {
+ if (dst_w * 2 == src_w && dst_h * 2 == src_h && phase_scaler == 0) {
downsample_2_to_1_ssse3(src->y_buffer, src->y_stride, dst->y_buffer,
dst->y_stride, dst_w, dst_h);
downsample_2_to_1_ssse3(src->u_buffer, src->uv_stride, dst->u_buffer,
@@ -184,7 +186,7 @@ void vp9_scale_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src,
downsample_2_to_1_ssse3(src->v_buffer, src->uv_stride, dst->v_buffer,
dst->uv_stride, dst_uv_w, dst_uv_h);
vpx_extend_frame_borders(dst);
- } else if (dst_w == src_w * 2 && dst_h == src_h * 2) {
+ } else if (dst_w == src_w * 2 && dst_h == src_h * 2 && phase_scaler == 0) {
// The upsample() supports widths up to 1920 * 2. If greater, fall back
// to vp9_scale_and_extend_frame_c().
if (dst_w / 2 <= 1920) {
@@ -196,9 +198,9 @@ void vp9_scale_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src,
dst->uv_stride, dst_uv_w, dst_uv_h);
vpx_extend_frame_borders(dst);
} else {
- vp9_scale_and_extend_frame_c(src, dst);
+ vp9_scale_and_extend_frame_c(src, dst, filter_type, phase_scaler);
}
} else {
- vp9_scale_and_extend_frame_c(src, dst);
+ vp9_scale_and_extend_frame_c(src, dst, filter_type, phase_scaler);
}
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm
deleted file mode 100644
index 21aaa938318..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm
+++ /dev/null
@@ -1,212 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-; void vp9_temporal_filter_apply_sse2 | arg
-; (unsigned char *frame1, | 0
-; unsigned int stride, | 1
-; unsigned char *frame2, | 2
-; unsigned int block_width, | 3
-; unsigned int block_height, | 4
-; int strength, | 5
-; int filter_weight, | 6
-; unsigned int *accumulator, | 7
-; unsigned short *count) | 8
-global sym(vp9_temporal_filter_apply_sse2) PRIVATE
-sym(vp9_temporal_filter_apply_sse2):
-
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 9
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ALIGN_STACK 16, rax
- %define block_width 0
- %define block_height 16
- %define strength 32
- %define filter_weight 48
- %define rounding_bit 64
- %define rbp_backup 80
- %define stack_size 96
- sub rsp, stack_size
- mov [rsp + rbp_backup], rbp
- ; end prolog
-
- mov edx, arg(3)
- mov [rsp + block_width], rdx
- mov edx, arg(4)
- mov [rsp + block_height], rdx
- movd xmm6, arg(5)
- movdqa [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read
-
- ; calculate the rounding bit outside the loop
- ; 0x8000 >> (16 - strength)
- mov rdx, 16
- sub rdx, arg(5) ; 16 - strength
- movq xmm4, rdx ; can't use rdx w/ shift
- movdqa xmm5, [GLOBAL(_const_top_bit)]
- psrlw xmm5, xmm4
- movdqa [rsp + rounding_bit], xmm5
-
- mov rsi, arg(0) ; src/frame1
- mov rdx, arg(2) ; predictor frame
- mov rdi, arg(7) ; accumulator
- mov rax, arg(8) ; count
-
- ; dup the filter weight and store for later
- movd xmm0, arg(6) ; filter_weight
- pshuflw xmm0, xmm0, 0
- punpcklwd xmm0, xmm0
- movdqa [rsp + filter_weight], xmm0
-
- mov rbp, arg(1) ; stride
- pxor xmm7, xmm7 ; zero for extraction
-
- mov rcx, [rsp + block_width]
- imul rcx, [rsp + block_height]
- add rcx, rdx
- cmp dword ptr [rsp + block_width], 8
- jne .temporal_filter_apply_load_16
-
-.temporal_filter_apply_load_8:
- movq xmm0, [rsi] ; first row
- lea rsi, [rsi + rbp] ; += stride
- punpcklbw xmm0, xmm7 ; src[ 0- 7]
- movq xmm1, [rsi] ; second row
- lea rsi, [rsi + rbp] ; += stride
- punpcklbw xmm1, xmm7 ; src[ 8-15]
- jmp .temporal_filter_apply_load_finished
-
-.temporal_filter_apply_load_16:
- movdqa xmm0, [rsi] ; src (frame1)
- lea rsi, [rsi + rbp] ; += stride
- movdqa xmm1, xmm0
- punpcklbw xmm0, xmm7 ; src[ 0- 7]
- punpckhbw xmm1, xmm7 ; src[ 8-15]
-
-.temporal_filter_apply_load_finished:
- movdqa xmm2, [rdx] ; predictor (frame2)
- movdqa xmm3, xmm2
- punpcklbw xmm2, xmm7 ; pred[ 0- 7]
- punpckhbw xmm3, xmm7 ; pred[ 8-15]
-
- ; modifier = src_byte - pixel_value
- psubw xmm0, xmm2 ; src - pred[ 0- 7]
- psubw xmm1, xmm3 ; src - pred[ 8-15]
-
- ; modifier *= modifier
- pmullw xmm0, xmm0 ; modifer[ 0- 7]^2
- pmullw xmm1, xmm1 ; modifer[ 8-15]^2
-
- ; modifier *= 3
- pmullw xmm0, [GLOBAL(_const_3w)]
- pmullw xmm1, [GLOBAL(_const_3w)]
-
- ; modifer += 0x8000 >> (16 - strength)
- paddw xmm0, [rsp + rounding_bit]
- paddw xmm1, [rsp + rounding_bit]
-
- ; modifier >>= strength
- psrlw xmm0, [rsp + strength]
- psrlw xmm1, [rsp + strength]
-
- ; modifier = 16 - modifier
- ; saturation takes care of modifier > 16
- movdqa xmm3, [GLOBAL(_const_16w)]
- movdqa xmm2, [GLOBAL(_const_16w)]
- psubusw xmm3, xmm1
- psubusw xmm2, xmm0
-
- ; modifier *= filter_weight
- pmullw xmm2, [rsp + filter_weight]
- pmullw xmm3, [rsp + filter_weight]
-
- ; count
- movdqa xmm4, [rax]
- movdqa xmm5, [rax+16]
- ; += modifier
- paddw xmm4, xmm2
- paddw xmm5, xmm3
- ; write back
- movdqa [rax], xmm4
- movdqa [rax+16], xmm5
- lea rax, [rax + 16*2] ; count += 16*(sizeof(short))
-
- ; load and extract the predictor up to shorts
- pxor xmm7, xmm7
- movdqa xmm0, [rdx]
- lea rdx, [rdx + 16*1] ; pred += 16*(sizeof(char))
- movdqa xmm1, xmm0
- punpcklbw xmm0, xmm7 ; pred[ 0- 7]
- punpckhbw xmm1, xmm7 ; pred[ 8-15]
-
- ; modifier *= pixel_value
- pmullw xmm0, xmm2
- pmullw xmm1, xmm3
-
- ; expand to double words
- movdqa xmm2, xmm0
- punpcklwd xmm0, xmm7 ; [ 0- 3]
- punpckhwd xmm2, xmm7 ; [ 4- 7]
- movdqa xmm3, xmm1
- punpcklwd xmm1, xmm7 ; [ 8-11]
- punpckhwd xmm3, xmm7 ; [12-15]
-
- ; accumulator
- movdqa xmm4, [rdi]
- movdqa xmm5, [rdi+16]
- movdqa xmm6, [rdi+32]
- movdqa xmm7, [rdi+48]
- ; += modifier
- paddd xmm4, xmm0
- paddd xmm5, xmm2
- paddd xmm6, xmm1
- paddd xmm7, xmm3
- ; write back
- movdqa [rdi], xmm4
- movdqa [rdi+16], xmm5
- movdqa [rdi+32], xmm6
- movdqa [rdi+48], xmm7
- lea rdi, [rdi + 16*4] ; accumulator += 16*(sizeof(int))
-
- cmp rdx, rcx
- je .temporal_filter_apply_epilog
- pxor xmm7, xmm7 ; zero for extraction
- cmp dword ptr [rsp + block_width], 16
- je .temporal_filter_apply_load_16
- jmp .temporal_filter_apply_load_8
-
-.temporal_filter_apply_epilog:
- ; begin epilog
- mov rbp, [rsp + rbp_backup]
- add rsp, stack_size
- pop rsp
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-SECTION_RODATA
-align 16
-_const_3w:
- times 8 dw 3
-align 16
-_const_top_bit:
- times 8 dw 1<<15
-align 16
-_const_16w
- times 8 dw 16
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/vp9_cx_iface.c b/chromium/third_party/libvpx/source/libvpx/vp9/vp9_cx_iface.c
index a335a4ab55d..25fc80a9a1e 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/vp9_cx_iface.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/vp9_cx_iface.c
@@ -52,7 +52,6 @@ struct vp9_extracfg {
int render_width;
int render_height;
unsigned int row_mt;
- unsigned int row_mt_bit_exact;
unsigned int motion_vector_unit_test;
};
@@ -86,7 +85,6 @@ static struct vp9_extracfg default_extra_cfg = {
0, // render width
0, // render height
0, // row_mt
- 0, // row_mt_bit_exact
0, // motion_vector_unit_test
};
@@ -252,7 +250,6 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
"or kf_max_dist instead.");
RANGE_CHECK(extra_cfg, row_mt, 0, 1);
- RANGE_CHECK(extra_cfg, row_mt_bit_exact, 0, 1);
RANGE_CHECK(extra_cfg, motion_vector_unit_test, 0, 2);
RANGE_CHECK(extra_cfg, enable_auto_alt_ref, 0, 2);
RANGE_CHECK(extra_cfg, cpu_used, -8, 8);
@@ -564,7 +561,6 @@ static vpx_codec_err_t set_encoder_config(
oxcf->target_level = extra_cfg->target_level;
oxcf->row_mt = extra_cfg->row_mt;
- oxcf->row_mt_bit_exact = extra_cfg->row_mt_bit_exact;
oxcf->motion_vector_unit_test = extra_cfg->motion_vector_unit_test;
for (sl = 0; sl < oxcf->ss_number_layers; ++sl) {
@@ -862,13 +858,6 @@ static vpx_codec_err_t ctrl_set_row_mt(vpx_codec_alg_priv_t *ctx,
return update_extra_cfg(ctx, &extra_cfg);
}
-static vpx_codec_err_t ctrl_enable_row_mt_bit_exact(vpx_codec_alg_priv_t *ctx,
- va_list args) {
- struct vp9_extracfg extra_cfg = ctx->extra_cfg;
- extra_cfg.row_mt_bit_exact = CAST(VP9E_ENABLE_ROW_MT_BIT_EXACT, args);
- return update_extra_cfg(ctx, &extra_cfg);
-}
-
static vpx_codec_err_t ctrl_enable_motion_vector_unit_test(
vpx_codec_alg_priv_t *ctx, va_list args) {
struct vp9_extracfg extra_cfg = ctx->extra_cfg;
@@ -1633,7 +1622,6 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
{ VP9E_SET_RENDER_SIZE, ctrl_set_render_size },
{ VP9E_SET_TARGET_LEVEL, ctrl_set_target_level },
{ VP9E_SET_ROW_MT, ctrl_set_row_mt },
- { VP9E_ENABLE_ROW_MT_BIT_EXACT, ctrl_enable_row_mt_bit_exact },
{ VP9E_ENABLE_MOTION_VECTOR_UNIT_TEST, ctrl_enable_motion_vector_unit_test },
// Getters
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/vp9cx.mk b/chromium/third_party/libvpx/source/libvpx/vp9/vp9cx.mk
index e0913bea3e6..47846c9410d 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/vp9cx.mk
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/vp9cx.mk
@@ -100,7 +100,8 @@ VP9_CX_SRCS-yes += encoder/vp9_temporal_filter.h
VP9_CX_SRCS-yes += encoder/vp9_mbgraph.c
VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h
-VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm
+VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/temporal_filter_sse4.c
+
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c
VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_diamond_search_sad_avx.c
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
@@ -120,9 +121,10 @@ VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_frame_scale_ssse3.c
ifeq ($(CONFIG_VP9_TEMPORAL_DENOISING),yes)
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_denoiser_sse2.c
+VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_denoiser_neon.c
endif
-VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_intrin_avx2.c
+VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_avx2.c
ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_error_neon.c
@@ -135,6 +137,5 @@ VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct4x4_msa.c
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct8x8_msa.c
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct16x16_msa.c
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct_msa.h
-VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_temporal_filter_msa.c
VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes))
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx/src/svc_encodeframe.c b/chromium/third_party/libvpx/source/libvpx/vpx/src/svc_encodeframe.c
index c2f80d88515..c774abb34f2 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx/src/svc_encodeframe.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx/src/svc_encodeframe.c
@@ -436,6 +436,10 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
si->svc_params.scaling_factor_num[sl] = DEFAULT_SCALE_FACTORS_NUM_2x[sl2];
si->svc_params.scaling_factor_den[sl] = DEFAULT_SCALE_FACTORS_DEN_2x[sl2];
}
+ if (svc_ctx->spatial_layers == 1) {
+ si->svc_params.scaling_factor_num[0] = 1;
+ si->svc_params.scaling_factor_den[0] = 1;
+ }
}
for (tl = 0; tl < svc_ctx->temporal_layers; ++tl) {
for (sl = 0; sl < svc_ctx->spatial_layers; ++sl) {
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx/vp8cx.h b/chromium/third_party/libvpx/source/libvpx/vpx/vp8cx.h
index b8ed0bb2e6a..ee6be4a249c 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx/vp8cx.h
+++ b/chromium/third_party/libvpx/source/libvpx/vpx/vp8cx.h
@@ -555,15 +555,6 @@ enum vp8e_enc_control_id {
*/
VP9E_SET_ROW_MT,
- /*!\brief Codec control function to enable bit-exact bitstream when row level
- * multi-threading is enabled.
- *
- * 0 : off, 1 : on
- *
- * Supported in codecs: VP9
- */
- VP9E_ENABLE_ROW_MT_BIT_EXACT,
-
/*!\brief Codec control function to get bitstream level.
*
* Supported in codecs: VP9
@@ -867,9 +858,6 @@ VPX_CTRL_USE_TYPE(VP9E_SET_TARGET_LEVEL, unsigned int)
VPX_CTRL_USE_TYPE(VP9E_SET_ROW_MT, unsigned int)
#define VPX_CTRL_VP9E_SET_ROW_MT
-VPX_CTRL_USE_TYPE(VP9E_ENABLE_ROW_MT_BIT_EXACT, unsigned int)
-#define VPX_CTRL_VP9E_ENABLE_ROW_MT_BIT_EXACT
-
VPX_CTRL_USE_TYPE(VP9E_GET_LEVEL, int *)
#define VPX_CTRL_VP9E_GET_LEVEL
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/avg_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/avg_neon.c
index cca9a932423..257e8ffee57 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/avg_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/avg_neon.c
@@ -16,6 +16,7 @@
#include "vpx/vpx_integer.h"
#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
static INLINE unsigned int horizontal_add_u16x8(const uint16x8_t v_16x8) {
const uint32x4_t a = vpaddlq_u16(v_16x8);
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/fdct_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/fdct_neon.c
new file mode 100644
index 00000000000..fe78f3f5138
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/fdct_neon.c
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+
+void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *final_output,
+ int stride) {
+ int i;
+ // input[M * stride] * 16
+ int16x4_t input_0 = vshl_n_s16(vld1_s16(input + 0 * stride), 4);
+ int16x4_t input_1 = vshl_n_s16(vld1_s16(input + 1 * stride), 4);
+ int16x4_t input_2 = vshl_n_s16(vld1_s16(input + 2 * stride), 4);
+ int16x4_t input_3 = vshl_n_s16(vld1_s16(input + 3 * stride), 4);
+
+ // If the very first value != 0, then add 1.
+ if (input[0] != 0) {
+ const int16x4_t one = vreinterpret_s16_s64(vdup_n_s64(1));
+ input_0 = vadd_s16(input_0, one);
+ }
+
+ for (i = 0; i < 2; ++i) {
+ const int16x8_t input_01 = vcombine_s16(input_0, input_1);
+ const int16x8_t input_32 = vcombine_s16(input_3, input_2);
+
+ // in_0 +/- in_3, in_1 +/- in_2
+ const int16x8_t s_01 = vaddq_s16(input_01, input_32);
+ const int16x8_t s_32 = vsubq_s16(input_01, input_32);
+
+ // step_0 +/- step_1, step_2 +/- step_3
+ const int16x4_t s_0 = vget_low_s16(s_01);
+ const int16x4_t s_1 = vget_high_s16(s_01);
+ const int16x4_t s_2 = vget_high_s16(s_32);
+ const int16x4_t s_3 = vget_low_s16(s_32);
+
+ // (s_0 +/- s_1) * cospi_16_64
+ // Must expand all elements to s32. See 'needs32' comment in fwd_txfm.c.
+ const int32x4_t s_0_p_s_1 = vaddl_s16(s_0, s_1);
+ const int32x4_t s_0_m_s_1 = vsubl_s16(s_0, s_1);
+ const int32x4_t temp1 = vmulq_n_s32(s_0_p_s_1, (int16_t)cospi_16_64);
+ const int32x4_t temp2 = vmulq_n_s32(s_0_m_s_1, (int16_t)cospi_16_64);
+
+ // fdct_round_shift
+ int16x4_t out_0 = vrshrn_n_s32(temp1, DCT_CONST_BITS);
+ int16x4_t out_2 = vrshrn_n_s32(temp2, DCT_CONST_BITS);
+
+ // s_3 * cospi_8_64 + s_2 * cospi_24_64
+ // s_3 * cospi_24_64 - s_2 * cospi_8_64
+ const int32x4_t s_3_cospi_8_64 = vmull_n_s16(s_3, (int16_t)cospi_8_64);
+ const int32x4_t s_3_cospi_24_64 = vmull_n_s16(s_3, (int16_t)cospi_24_64);
+
+ const int32x4_t temp3 =
+ vmlal_n_s16(s_3_cospi_8_64, s_2, (int16_t)cospi_24_64);
+ const int32x4_t temp4 =
+ vmlsl_n_s16(s_3_cospi_24_64, s_2, (int16_t)cospi_8_64);
+
+ // fdct_round_shift
+ int16x4_t out_1 = vrshrn_n_s32(temp3, DCT_CONST_BITS);
+ int16x4_t out_3 = vrshrn_n_s32(temp4, DCT_CONST_BITS);
+
+ transpose_s16_4x4d(&out_0, &out_1, &out_2, &out_3);
+
+ input_0 = out_0;
+ input_1 = out_1;
+ input_2 = out_2;
+ input_3 = out_3;
+ }
+
+ {
+    // Not quite a rounding shift: only 1 is added despite shifting by 2.
+ const int16x8_t one = vdupq_n_s16(1);
+ int16x8_t out_01 = vcombine_s16(input_0, input_1);
+ int16x8_t out_23 = vcombine_s16(input_2, input_3);
+ out_01 = vshrq_n_s16(vaddq_s16(out_01, one), 2);
+ out_23 = vshrq_n_s16(vaddq_s16(out_23, one), 2);
+ store_s16q_to_tran_low(final_output + 0 * 8, out_01);
+ store_s16q_to_tran_low(final_output + 1 * 8, out_23);
+ }
+}
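For comparison, a short sketch of the final down shift above against a true rounding shift (assuming int16_t intermediates):

    #include <stdint.h>

    /* The code above computes (x + 1) >> 2 per element; a true rounding
     * shift, vrshrq_n_s16(x, 2), would compute (x + 2) >> 2 instead. The
     * bias of 1 matches the C version of the 4x4 forward transform. */
    static int16_t fdct4x4_final_shift(int16_t x) {
      return (int16_t)((x + 1) >> 2);
    }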
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/fwd_txfm_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/fwd_txfm_neon.c
index 96f6de1be95..c449b466016 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/fwd_txfm_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/fwd_txfm_neon.c
@@ -14,6 +14,7 @@
#include "vpx_dsp/txfm_common.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output,
int stride) {
@@ -125,6 +126,8 @@ void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output,
out_7 = vcombine_s16(f, h); // 34 35 36 37 74 75 76 77
}
// transpose 8x8
+ // Can't use transpose_s16_8x8() because the values are arranged in two 4x8
+ // columns.
{
// 00 01 02 03 40 41 42 43
// 10 11 12 13 50 51 52 53
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/hadamard_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/hadamard_neon.c
index ebeafed31fd..79bedd848a3 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/hadamard_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/hadamard_neon.c
@@ -13,6 +13,7 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"
static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2,
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c
index 1259bb3807b..98e42cd25ab 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c
@@ -1268,10 +1268,8 @@ void vpx_highbd_idct16x16_10_add_half1d_pass2(const int32_t *input,
}
}
-void vpx_highbd_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct16x16_256_add_neon(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
if (bd == 8) {
int16_t row_idct_output[16 * 16];
@@ -1313,10 +1311,8 @@ void vpx_highbd_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest8,
}
}
-void vpx_highbd_idct16x16_38_add_neon(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct16x16_38_add_neon(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
if (bd == 8) {
int16_t row_idct_output[16 * 16];
@@ -1349,10 +1345,8 @@ void vpx_highbd_idct16x16_38_add_neon(const tran_low_t *input, uint8_t *dest8,
}
}
-void vpx_highbd_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct16x16_10_add_neon(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
if (bd == 8) {
int16_t row_idct_output[4 * 16];
@@ -1414,7 +1408,7 @@ static INLINE void highbd_idct16x16_1_add_neg_kernel(uint16_t **dest,
*dest += stride;
}
-void vpx_highbd_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct16x16_1_add_neon(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
const tran_low_t out0 =
HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
@@ -1422,7 +1416,6 @@ void vpx_highbd_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest8,
HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd);
const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6);
const int16x8_t dc = vdupq_n_s16(a1);
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
int i;
if (a1 >= 0) {
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c
index 858342830d8..96a55c472f6 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c
@@ -386,15 +386,14 @@ static INLINE void idct32_bands_end_2nd_pass(const int32_t *const out,
}
static INLINE void vpx_highbd_idct32_32_neon(const tran_low_t *input,
- uint8_t *const dest,
- const int stride, const int bd) {
+ uint16_t *dst, const int stride,
+ const int bd) {
int i, idct32_pass_loop;
int32_t trans_buf[32 * 8];
int32_t pass1[32 * 32];
int32_t pass2[32 * 32];
int32_t *out;
int32x4x2_t q[16];
- uint16_t *dst = CONVERT_TO_SHORTPTR(dest);
for (idct32_pass_loop = 0, out = pass1; idct32_pass_loop < 2;
idct32_pass_loop++, input = pass1, out = pass2) {
@@ -637,10 +636,10 @@ static INLINE void vpx_highbd_idct32_32_neon(const tran_low_t *input,
}
}
-void vpx_highbd_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest,
+void vpx_highbd_idct32x32_1024_add_neon(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
if (bd == 8) {
- vpx_idct32_32_neon(input, dest, stride, 1);
+ vpx_idct32_32_neon(input, CAST_TO_BYTEPTR(dest), stride, 1);
} else {
vpx_highbd_idct32_32_neon(input, dest, stride, bd);
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c
index 52f3d43e5c4..3970a5a8613 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c
@@ -726,10 +726,9 @@ static void vpx_highbd_idct32_16_neon(const int32_t *const input,
highbd_idct16x16_add_store(out + 16, output + 16 * stride, stride, bd);
}
-void vpx_highbd_idct32x32_135_add_neon(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct32x32_135_add_neon(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
int i;
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
if (bd == 8) {
int16_t temp[32 * 16];
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c
index 195dcc92d5e..5d9063b15dc 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c
@@ -594,10 +594,9 @@ static void vpx_highbd_idct32_8_neon(const int32_t *input, uint16_t *output,
highbd_idct16x16_add_store(out + 16, output + 16 * stride, stride, bd);
}
-void vpx_highbd_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct32x32_34_add_neon(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
int i;
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
if (bd == 8) {
int16_t temp[32 * 8];
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct32x32_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct32x32_add_neon.c
index d74331f8031..63eb49678cc 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct32x32_add_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct32x32_add_neon.c
@@ -59,7 +59,7 @@ static INLINE void highbd_idct32x32_1_add_neg_kernel(uint16_t **dest,
*dest += stride;
}
-void vpx_highbd_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct32x32_1_add_neon(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
const tran_low_t out0 =
HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
@@ -67,7 +67,6 @@ void vpx_highbd_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest8,
HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd);
const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6);
const int16x8_t dc = vdupq_n_s16(a1);
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
int i;
if (a1 >= 0) {
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c
index 128f72b9c96..20b09f68343 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c
@@ -51,7 +51,7 @@ static INLINE void highbd_idct4x4_1_add_kernel2(uint16_t **dest,
*dest += stride;
}
-void vpx_highbd_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct4x4_1_add_neon(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
const tran_low_t out0 =
@@ -60,7 +60,6 @@ void vpx_highbd_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest8,
HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd);
const int16_t a1 = ROUND_POWER_OF_TWO(out1, 4);
const int16x8_t dc = vdupq_n_s16(a1);
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
highbd_idct4x4_1_add_kernel1(&dest, stride, dc, max);
highbd_idct4x4_1_add_kernel1(&dest, stride, dc, max);
@@ -133,14 +132,13 @@ static INLINE void idct4x4_16_kernel_bd12(const int32x4_t cospis,
*a3 = vsubq_s32(b0, b3);
}
-void vpx_highbd_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct4x4_16_add_neon(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
int32x4_t c0 = vld1q_s32(input);
int32x4_t c1 = vld1q_s32(input + 4);
int32x4_t c2 = vld1q_s32(input + 8);
int32x4_t c3 = vld1q_s32(input + 12);
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
int16x8_t a0, a1;
if (bd == 8) {
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c
index f53f4c7fcad..6687e764959 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c
@@ -36,7 +36,7 @@ static INLINE void highbd_idct8x8_1_add_neg_kernel(uint16_t **dest,
*dest += stride;
}
-void vpx_highbd_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct8x8_1_add_neon(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
const tran_low_t out0 =
HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
@@ -44,7 +44,6 @@ void vpx_highbd_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest8,
HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd);
const int16_t a1 = ROUND_POWER_OF_TWO(out1, 5);
const int16x8_t dc = vdupq_n_s16(a1);
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
if (a1 >= 0) {
const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
@@ -292,9 +291,8 @@ static INLINE void highbd_add8x8(int16x8_t a0, int16x8_t a1, int16x8_t a2,
vst1q_u16(dest, d7_u16);
}
-void vpx_highbd_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct8x8_12_add_neon(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
int32x4_t a0 = vld1q_s32(input);
int32x4_t a1 = vld1q_s32(input + 8);
int32x4_t a2 = vld1q_s32(input + 16);
@@ -553,9 +551,8 @@ static INLINE void idct8x8_64_half1d_bd12(
*io7 = vsubq_s32(step1[0], step2[7]);
}
-void vpx_highbd_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct8x8_64_add_neon(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
int32x4_t a0 = vld1q_s32(input);
int32x4_t a1 = vld1q_s32(input + 4);
int32x4_t a2 = vld1q_s32(input + 8);
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c
index 1fde13e8d6d..74345e1facf 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c
@@ -135,18 +135,16 @@ static INLINE uint16x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1,
return d;
}
-void vpx_highbd_convolve8_horiz_neon(const uint8_t *src8, ptrdiff_t src_stride,
- uint8_t *dst8, ptrdiff_t dst_stride,
+void vpx_highbd_convolve8_horiz_neon(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, // unused
int y_step_q4, // unused
int w, int h, int bd) {
if (x_step_q4 != 16) {
- vpx_highbd_convolve8_horiz_c(src8, src_stride, dst8, dst_stride, filter_x,
+ vpx_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x,
x_step_q4, filter_y, y_step_q4, w, h, bd);
} else {
- const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
const int16x8_t filters = vld1q_s16(filter_x);
const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
uint16x8_t t0, t1, t2, t3;
@@ -336,20 +334,17 @@ void vpx_highbd_convolve8_horiz_neon(const uint8_t *src8, ptrdiff_t src_stride,
}
}
-void vpx_highbd_convolve8_avg_horiz_neon(const uint8_t *src8,
- ptrdiff_t src_stride, uint8_t *dst8,
+void vpx_highbd_convolve8_avg_horiz_neon(const uint16_t *src,
+ ptrdiff_t src_stride, uint16_t *dst,
ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, // unused
int y_step_q4, // unused
int w, int h, int bd) {
if (x_step_q4 != 16) {
- vpx_highbd_convolve8_avg_horiz_c(src8, src_stride, dst8, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4,
- w, h, bd);
+ vpx_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h, bd);
} else {
- const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
const int16x8_t filters = vld1q_s16(filter_x);
const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
uint16x8_t t0, t1, t2, t3;
@@ -569,18 +564,16 @@ void vpx_highbd_convolve8_avg_horiz_neon(const uint8_t *src8,
}
}
-void vpx_highbd_convolve8_vert_neon(const uint8_t *src8, ptrdiff_t src_stride,
- uint8_t *dst8, ptrdiff_t dst_stride,
+void vpx_highbd_convolve8_vert_neon(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, // unused
int x_step_q4, // unused
const int16_t *filter_y, int y_step_q4,
int w, int h, int bd) {
if (y_step_q4 != 16) {
- vpx_highbd_convolve8_vert_c(src8, src_stride, dst8, dst_stride, filter_x,
+ vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x,
x_step_q4, filter_y, y_step_q4, w, h, bd);
} else {
- const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
const int16x8_t filters = vld1q_s16(filter_y);
const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
@@ -736,20 +729,17 @@ void vpx_highbd_convolve8_vert_neon(const uint8_t *src8, ptrdiff_t src_stride,
}
}
-void vpx_highbd_convolve8_avg_vert_neon(const uint8_t *src8,
- ptrdiff_t src_stride, uint8_t *dst8,
+void vpx_highbd_convolve8_avg_vert_neon(const uint16_t *src,
+ ptrdiff_t src_stride, uint16_t *dst,
ptrdiff_t dst_stride,
const int16_t *filter_x, // unused
int x_step_q4, // unused
const int16_t *filter_y, int y_step_q4,
int w, int h, int bd) {
if (y_step_q4 != 16) {
- vpx_highbd_convolve8_avg_vert_c(src8, src_stride, dst8, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4, w,
- h, bd);
+ vpx_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h, bd);
} else {
- const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
const int16x8_t filters = vld1q_s16(filter_y);
const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c
index f4d70761eb3..4ff3dea085e 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c
@@ -13,14 +13,11 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
-void vpx_highbd_convolve_avg_neon(const uint8_t *src8, ptrdiff_t src_stride,
- uint8_t *dst8, ptrdiff_t dst_stride,
+void vpx_highbd_convolve_avg_neon(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int filter_x_stride,
const int16_t *filter_y, int filter_y_stride,
int w, int h, int bd) {
- const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
-
(void)filter_x;
(void)filter_x_stride;
(void)filter_y;
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c
index a980ab1a380..61712d48e3c 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c
@@ -13,14 +13,11 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
-void vpx_highbd_convolve_copy_neon(const uint8_t *src8, ptrdiff_t src_stride,
- uint8_t *dst8, ptrdiff_t dst_stride,
+void vpx_highbd_convolve_copy_neon(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int filter_x_stride,
const int16_t *filter_y, int filter_y_stride,
int w, int h, int bd) {
- const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
-
(void)filter_x;
(void)filter_x_stride;
(void)filter_y;
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c
index 4e6e109920a..f769620a43b 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c
@@ -13,12 +13,11 @@
#include "vpx_dsp/vpx_filter.h"
#include "vpx_ports/mem.h"
-void vpx_highbd_convolve8_neon(const uint8_t *src8, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
+void vpx_highbd_convolve8_neon(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4, int w,
int h, int bd) {
- const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
const int y0_q4 = get_filter_offset(filter_y, get_filter_base(filter_y));
// + 1 to make it divisible by 4
DECLARE_ALIGNED(16, uint16_t, temp[64 * 136]);
@@ -29,23 +28,20 @@ void vpx_highbd_convolve8_neon(const uint8_t *src8, ptrdiff_t src_stride,
 * height and filter a multiple of 4 lines. Since this goes into the temp
* buffer which has lots of extra room and is subsequently discarded this is
* safe if somewhat less than ideal. */
- vpx_highbd_convolve8_horiz_neon(CONVERT_TO_BYTEPTR(src - src_stride * 3),
- src_stride, CONVERT_TO_BYTEPTR(temp), w,
+ vpx_highbd_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w,
filter_x, x_step_q4, filter_y, y_step_q4, w,
intermediate_height, bd);
/* Step into the temp buffer 3 lines to get the actual frame data */
- vpx_highbd_convolve8_vert_neon(CONVERT_TO_BYTEPTR(temp + w * 3), w, dst,
- dst_stride, filter_x, x_step_q4, filter_y,
- y_step_q4, w, h, bd);
+ vpx_highbd_convolve8_vert_neon(temp + w * 3, w, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h, bd);
}
-void vpx_highbd_convolve8_avg_neon(const uint8_t *src8, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
+void vpx_highbd_convolve8_avg_neon(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h, int bd) {
- const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
const int y0_q4 = get_filter_offset(filter_y, get_filter_base(filter_y));
// + 1 to make it divisible by 4
DECLARE_ALIGNED(16, uint16_t, temp[64 * 136]);
@@ -55,11 +51,9 @@ void vpx_highbd_convolve8_avg_neon(const uint8_t *src8, ptrdiff_t src_stride,
/* This implementation has the same issues as above. In addition, we only want
* to average the values after both passes.
*/
- vpx_highbd_convolve8_horiz_neon(CONVERT_TO_BYTEPTR(src - src_stride * 3),
- src_stride, CONVERT_TO_BYTEPTR(temp), w,
+ vpx_highbd_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w,
filter_x, x_step_q4, filter_y, y_step_q4, w,
intermediate_height, bd);
- vpx_highbd_convolve8_avg_vert_neon(CONVERT_TO_BYTEPTR(temp + w * 3), w, dst,
- dst_stride, filter_x, x_step_q4, filter_y,
- y_step_q4, w, h, bd);
+ vpx_highbd_convolve8_avg_vert_neon(temp + w * 3, w, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h, bd);
}
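A note on the pattern being removed throughout these convolve hunks: the old highbd convention smuggled uint16_t buffers through uint8_t pointers via address shifting, and this update switches the rtcd prototypes to native uint16_t so the per-kernel unwrapping disappears. A minimal before/after sketch; the macro bodies are quoted from memory of vpx_ports/mem.h and should be treated as an assumption, as should the hypothetical kernel names:

    #include <stdint.h>

    /* Assumed macro bodies (vpx_ports/mem.h, from memory): */
    #define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
    #define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))

    /* Old style: every highbd kernel unwrapped its shifted arguments. */
    static void old_style_kernel(const uint8_t *src8, uint16_t *out) {
      const uint16_t *src = CONVERT_TO_SHORTPTR(src8); /* undo the shift */
      *out = src[0];
    }

    /* New style: native uint16_t parameters, no shift pair anywhere. */
    static void new_style_kernel(const uint16_t *src, uint16_t *out) {
      *out = src[0];
    }

The CAST_TO_SHORTPTR appearing in the idct32x32 hunk below is, presumably, a plain reinterpreting cast rather than a shift, which is why it replaces CONVERT_TO_SHORTPTR there.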
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.c
index 828fb5f6c71..5c5963d277e 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.c
@@ -12,6 +12,7 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/txfm_common.h"
static INLINE void wrap_low_4x2(const int32x4_t *const t32, int16x4_t *const d0,
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c
index b398259918a..021211bc990 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c
@@ -13,6 +13,7 @@
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_dsp/txfm_common.h"
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c
index fc0c4cd8462..f3c336fa31f 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c
@@ -13,6 +13,7 @@
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_dsp/txfm_common.h"
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_add_neon.c
index 34b5baf7236..9f4589ea968 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_add_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_add_neon.c
@@ -13,6 +13,7 @@
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_dsp/txfm_common.h"
@@ -517,7 +518,7 @@ void vpx_idct32_32_neon(const tran_low_t *input, uint8_t *dest,
const int16_t *input_pass2 = pass1; // input of pass2 is the result of pass1
int16_t *out;
int16x8_t q[16];
- uint16_t *dst = CONVERT_TO_SHORTPTR(dest);
+ uint16_t *dst = CAST_TO_SHORTPTR(dest);
for (idct32_pass_loop = 0, out = pass1; idct32_pass_loop < 2;
idct32_pass_loop++, out = pass2) {
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c
index d1eae24a222..21d21b03368 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c
@@ -12,6 +12,7 @@
#include <assert.h>
#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/inv_txfm.h"
static INLINE void idct4x4_1_add_kernel(uint8_t **dest, const int stride,
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_add_neon.c
index bff98cbc169..673a36840e3 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_add_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_add_neon.c
@@ -13,13 +13,14 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/txfm_common.h"
void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest,
int stride) {
const uint8_t *dst = dest;
const int16x4_t cospis = vld1_s16(kCospi);
- uint32x2_t dest01_u32 = vdup_n_u32(0);
+ uint8x8_t dest01_u8;
uint32x2_t dest32_u32 = vdup_n_u32(0);
int16x8_t a0, a1;
uint8x8_t d01, d32;
@@ -39,25 +40,22 @@ void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest,
a0 = vrshrq_n_s16(a0, 4);
a1 = vrshrq_n_s16(a1, 4);
- dest01_u32 = vld1_lane_u32((const uint32_t *)dst, dest01_u32, 0);
- dst += stride;
- dest01_u32 = vld1_lane_u32((const uint32_t *)dst, dest01_u32, 1);
- dst += stride;
+ dest01_u8 = load_u8(dst, stride);
+ dst += 2 * stride;
+ // The elements are loaded in reverse order.
dest32_u32 = vld1_lane_u32((const uint32_t *)dst, dest32_u32, 1);
dst += stride;
dest32_u32 = vld1_lane_u32((const uint32_t *)dst, dest32_u32, 0);
- d01_u16 =
- vaddw_u8(vreinterpretq_u16_s16(a0), vreinterpret_u8_u32(dest01_u32));
+ d01_u16 = vaddw_u8(vreinterpretq_u16_s16(a0), dest01_u8);
d32_u16 =
vaddw_u8(vreinterpretq_u16_s16(a1), vreinterpret_u8_u32(dest32_u32));
d01 = vqmovun_s16(vreinterpretq_s16_u16(d01_u16));
d32 = vqmovun_s16(vreinterpretq_s16_u16(d32_u16));
- vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d01), 0);
- dest += stride;
- vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d01), 1);
- dest += stride;
+ store_u8(dest, stride, d01);
+ dest += 2 * stride;
+ // The elements are stored in reverse order.
vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d32), 1);
dest += stride;
vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d32), 0);
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_add_neon.c
index 279da67d74f..1121ade2796 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_add_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_add_neon.c
@@ -13,6 +13,7 @@
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_dsp/txfm_common.h"
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.h b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.h
index 27c784edca9..0fc1de8e491 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.h
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.h
@@ -41,58 +41,6 @@ DECLARE_ALIGNED(16, static const int32_t, kCospi32[16]) = {
};
//------------------------------------------------------------------------------
-// Helper functions used to load tran_low_t into int16, narrowing if necessary.
-
-static INLINE int16x8x2_t load_tran_low_to_s16x2q(const tran_low_t *buf) {
-#if CONFIG_VP9_HIGHBITDEPTH
- const int32x4x2_t v0 = vld2q_s32(buf);
- const int32x4x2_t v1 = vld2q_s32(buf + 8);
- const int16x4_t s0 = vmovn_s32(v0.val[0]);
- const int16x4_t s1 = vmovn_s32(v0.val[1]);
- const int16x4_t s2 = vmovn_s32(v1.val[0]);
- const int16x4_t s3 = vmovn_s32(v1.val[1]);
- int16x8x2_t res;
- res.val[0] = vcombine_s16(s0, s2);
- res.val[1] = vcombine_s16(s1, s3);
- return res;
-#else
- return vld2q_s16(buf);
-#endif
-}
-
-static INLINE int16x8_t load_tran_low_to_s16q(const tran_low_t *buf) {
-#if CONFIG_VP9_HIGHBITDEPTH
- const int32x4_t v0 = vld1q_s32(buf);
- const int32x4_t v1 = vld1q_s32(buf + 4);
- const int16x4_t s0 = vmovn_s32(v0);
- const int16x4_t s1 = vmovn_s32(v1);
- return vcombine_s16(s0, s1);
-#else
- return vld1q_s16(buf);
-#endif
-}
-
-static INLINE int16x4_t load_tran_low_to_s16d(const tran_low_t *buf) {
-#if CONFIG_VP9_HIGHBITDEPTH
- const int32x4_t v0 = vld1q_s32(buf);
- return vmovn_s32(v0);
-#else
- return vld1_s16(buf);
-#endif
-}
-
-static INLINE void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) {
-#if CONFIG_VP9_HIGHBITDEPTH
- const int32x4_t v0 = vmovl_s16(vget_low_s16(a));
- const int32x4_t v1 = vmovl_s16(vget_high_s16(a));
- vst1q_s32(buf, v0);
- vst1q_s32(buf + 4, v1);
-#else
- vst1q_s16(buf, a);
-#endif
-}
-
-//------------------------------------------------------------------------------
// Use saturating add/sub to avoid overflow in 2nd pass in high bit-depth
static INLINE int16x8_t final_add(const int16x8_t a, const int16x8_t b) {
#if CONFIG_VP9_HIGHBITDEPTH
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/mem_neon.h b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/mem_neon.h
new file mode 100644
index 00000000000..ba5c3d513d4
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/mem_neon.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_ARM_MEM_NEON_H_
+#define VPX_DSP_ARM_MEM_NEON_H_
+
+#include <arm_neon.h>
+#include <assert.h>
+#include <string.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+// Helper functions used to load tran_low_t into int16, narrowing if necessary.
+static INLINE int16x8x2_t load_tran_low_to_s16x2q(const tran_low_t *buf) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ const int32x4x2_t v0 = vld2q_s32(buf);
+ const int32x4x2_t v1 = vld2q_s32(buf + 8);
+ const int16x4_t s0 = vmovn_s32(v0.val[0]);
+ const int16x4_t s1 = vmovn_s32(v0.val[1]);
+ const int16x4_t s2 = vmovn_s32(v1.val[0]);
+ const int16x4_t s3 = vmovn_s32(v1.val[1]);
+ int16x8x2_t res;
+ res.val[0] = vcombine_s16(s0, s2);
+ res.val[1] = vcombine_s16(s1, s3);
+ return res;
+#else
+ return vld2q_s16(buf);
+#endif
+}
+
+static INLINE int16x8_t load_tran_low_to_s16q(const tran_low_t *buf) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ const int32x4_t v0 = vld1q_s32(buf);
+ const int32x4_t v1 = vld1q_s32(buf + 4);
+ const int16x4_t s0 = vmovn_s32(v0);
+ const int16x4_t s1 = vmovn_s32(v1);
+ return vcombine_s16(s0, s1);
+#else
+ return vld1q_s16(buf);
+#endif
+}
+
+static INLINE int16x4_t load_tran_low_to_s16d(const tran_low_t *buf) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ const int32x4_t v0 = vld1q_s32(buf);
+ return vmovn_s32(v0);
+#else
+ return vld1_s16(buf);
+#endif
+}
+
+static INLINE void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ const int32x4_t v0 = vmovl_s16(vget_low_s16(a));
+ const int32x4_t v1 = vmovl_s16(vget_high_s16(a));
+ vst1q_s32(buf, v0);
+ vst1q_s32(buf + 4, v1);
+#else
+ vst1q_s16(buf, a);
+#endif
+}
+
+// Load 2 sets of 4 bytes when alignment is guaranteed.
+static INLINE uint8x8_t load_u8(const uint8_t *buf, int stride) {
+ uint32x2_t a = vdup_n_u32(0);
+
+ assert(!((intptr_t)buf % sizeof(uint32_t)));
+ assert(!(stride % sizeof(uint32_t)));
+
+ a = vld1_lane_u32((const uint32_t *)buf, a, 0);
+ buf += stride;
+ a = vld1_lane_u32((const uint32_t *)buf, a, 1);
+ return vreinterpret_u8_u32(a);
+}
+
+// Store 2 sets of 4 bytes when alignment is guaranteed.
+static INLINE void store_u8(uint8_t *buf, int stride, const uint8x8_t a) {
+ uint32x2_t a_u32 = vreinterpret_u32_u8(a);
+
+ assert(!((intptr_t)buf % sizeof(uint32_t)));
+ assert(!(stride % sizeof(uint32_t)));
+
+ vst1_lane_u32((uint32_t *)buf, a_u32, 0);
+ buf += stride;
+ vst1_lane_u32((uint32_t *)buf, a_u32, 1);
+}
+#endif // VPX_DSP_ARM_MEM_NEON_H_
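The new header collects the tran_low_t load/store helpers verbatim from idct_neon.h and adds the two aligned 4-byte-pair helpers used by the reworked vpx_idct4x4_16_add_neon above. A minimal usage sketch (the averaging routine and its name are illustrative, not from the patch):

    #include <arm_neon.h>

    #include "vpx_dsp/arm/mem_neon.h"

    /* Average two 4-pixel rows of a and b into dst (rows 0 and 1 of each).
     * All three pointers and the stride must be 4-byte aligned, per the
     * asserts in load_u8()/store_u8(). */
    static void avg_two_rows(const uint8_t *a, const uint8_t *b, uint8_t *dst,
                             int stride) {
      const uint8x8_t va = load_u8(a, stride);
      const uint8x8_t vb = load_u8(b, stride);
      store_u8(dst, stride, vrhadd_u8(va, vb)); /* rounded halving add */
    }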
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/subpel_variance_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/subpel_variance_neon.c
index f044e11a155..9b1622ff038 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/subpel_variance_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/subpel_variance_neon.c
@@ -22,12 +22,12 @@ static const uint8_t bilinear_filters[8][2] = {
{ 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 },
};
+// Process a block exactly 8 wide and any height.
static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
uint8_t *output_ptr,
unsigned int src_pixels_per_line,
int pixel_step,
unsigned int output_height,
- unsigned int output_width,
const uint8_t *filter) {
const uint8x8_t f0 = vmov_n_u8(filter[0]);
const uint8x8_t f1 = vmov_n_u8(filter[1]);
@@ -41,10 +41,11 @@ static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
vst1_u8(&output_ptr[0], out);
// Next row...
src_ptr += src_pixels_per_line;
- output_ptr += output_width;
+ output_ptr += 8;
}
}
+// Process a block which is a multiple of 16 wide and any height.
static void var_filter_block2d_bil_w16(const uint8_t *src_ptr,
uint8_t *output_ptr,
unsigned int src_pixels_per_line,
@@ -73,61 +74,36 @@ static void var_filter_block2d_bil_w16(const uint8_t *src_ptr,
}
}
-unsigned int vpx_sub_pixel_variance8x8_neon(const uint8_t *src, int src_stride,
- int xoffset, int yoffset,
- const uint8_t *dst, int dst_stride,
- unsigned int *sse) {
- DECLARE_ALIGNED(16, uint8_t, temp2[8 * 8]);
- DECLARE_ALIGNED(16, uint8_t, fdata3[9 * 8]);
-
- var_filter_block2d_bil_w8(src, fdata3, src_stride, 1, 9, 8,
- bilinear_filters[xoffset]);
- var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8, 8,
- bilinear_filters[yoffset]);
- return vpx_variance8x8_neon(temp2, 8, dst, dst_stride, sse);
-}
-
-unsigned int vpx_sub_pixel_variance16x16_neon(const uint8_t *src,
- int src_stride, int xoffset,
- int yoffset, const uint8_t *dst,
- int dst_stride,
- unsigned int *sse) {
- DECLARE_ALIGNED(16, uint8_t, temp2[16 * 16]);
- DECLARE_ALIGNED(16, uint8_t, fdata3[17 * 16]);
-
- var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 17, 16,
- bilinear_filters[xoffset]);
- var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 16, 16,
- bilinear_filters[yoffset]);
- return vpx_variance16x16_neon(temp2, 16, dst, dst_stride, sse);
-}
-
-unsigned int vpx_sub_pixel_variance32x32_neon(const uint8_t *src,
- int src_stride, int xoffset,
- int yoffset, const uint8_t *dst,
- int dst_stride,
- unsigned int *sse) {
- DECLARE_ALIGNED(16, uint8_t, temp2[32 * 32]);
- DECLARE_ALIGNED(16, uint8_t, fdata3[33 * 32]);
-
- var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 33, 32,
- bilinear_filters[xoffset]);
- var_filter_block2d_bil_w16(fdata3, temp2, 32, 32, 32, 32,
- bilinear_filters[yoffset]);
- return vpx_variance32x32_neon(temp2, 32, dst, dst_stride, sse);
-}
-
-unsigned int vpx_sub_pixel_variance64x64_neon(const uint8_t *src,
- int src_stride, int xoffset,
- int yoffset, const uint8_t *dst,
- int dst_stride,
- unsigned int *sse) {
- DECLARE_ALIGNED(16, uint8_t, temp2[64 * 64]);
- DECLARE_ALIGNED(16, uint8_t, fdata3[65 * 64]);
+// TODO(johannkoenig): support 4xM block sizes.
+#define sub_pixel_varianceNxM(n, m) \
+ unsigned int vpx_sub_pixel_variance##n##x##m##_neon( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, unsigned int *sse) { \
+ DECLARE_ALIGNED(16, uint8_t, fdata3[n * (m + 1)]); \
+ DECLARE_ALIGNED(16, uint8_t, temp2[n * m]); \
+ \
+ if (n == 8) { \
+ var_filter_block2d_bil_w8(src, fdata3, src_stride, 1, (m + 1), \
+ bilinear_filters[xoffset]); \
+ var_filter_block2d_bil_w8(fdata3, temp2, n, n, m, \
+ bilinear_filters[yoffset]); \
+ } else { \
+ var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, (m + 1), n, \
+ bilinear_filters[xoffset]); \
+ var_filter_block2d_bil_w16(fdata3, temp2, n, n, m, n, \
+ bilinear_filters[yoffset]); \
+ } \
+ return vpx_variance##n##x##m(temp2, n, dst, dst_stride, sse); \
+ }
- var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 65, 64,
- bilinear_filters[xoffset]);
- var_filter_block2d_bil_w16(fdata3, temp2, 64, 64, 64, 64,
- bilinear_filters[yoffset]);
- return vpx_variance64x64_neon(temp2, 64, dst, dst_stride, sse);
-}
+sub_pixel_varianceNxM(8, 4);
+sub_pixel_varianceNxM(8, 8);
+sub_pixel_varianceNxM(8, 16);
+sub_pixel_varianceNxM(16, 8);
+sub_pixel_varianceNxM(16, 16);
+sub_pixel_varianceNxM(16, 32);
+sub_pixel_varianceNxM(32, 16);
+sub_pixel_varianceNxM(32, 32);
+sub_pixel_varianceNxM(32, 64);
+sub_pixel_varianceNxM(64, 32);
+sub_pixel_varianceNxM(64, 64);
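For reference, sub_pixel_varianceNxM(8, 8) expands (modulo whitespace) to the hand-written vpx_sub_pixel_variance8x8_neon it replaces; since n is a compile-time constant the dead branch is eliminated, and the final call now resolves through the vpx_variance8x8 rtcd symbol rather than naming the _neon function directly:

    unsigned int vpx_sub_pixel_variance8x8_neon(
        const uint8_t *src, int src_stride, int xoffset, int yoffset,
        const uint8_t *dst, int dst_stride, unsigned int *sse) {
      DECLARE_ALIGNED(16, uint8_t, fdata3[8 * 9]); /* n * (m + 1) */
      DECLARE_ALIGNED(16, uint8_t, temp2[8 * 8]);  /* n * m */

      if (8 == 8) { /* always true here: the w8 path is selected */
        var_filter_block2d_bil_w8(src, fdata3, src_stride, 1, 9,
                                  bilinear_filters[xoffset]);
        var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8,
                                  bilinear_filters[yoffset]);
      } else { /* dead code, removed by the compiler */
        var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 9, 8,
                                   bilinear_filters[xoffset]);
        var_filter_block2d_bil_w16(fdata3, temp2, 8, 8, 8, 8,
                                   bilinear_filters[yoffset]);
      }
      return vpx_variance8x8(temp2, 8, dst, dst_stride, sse);
    }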
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/variance_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/variance_neon.c
index b6d7f86a4b2..c0828e8f639 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/variance_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/variance_neon.c
@@ -31,77 +31,129 @@ static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) {
return vget_lane_s32(c, 0);
}
-// w * h must be less than 2048 or local variable v_sum may overflow.
-static void variance_neon_w8(const uint8_t *a, int a_stride, const uint8_t *b,
- int b_stride, int w, int h, uint32_t *sse,
- int *sum) {
+// w * h must be less than 2048 or sum_s16 may overflow.
+// Process a block of any size where the width is divisible by 16.
+static void variance_neon_w16(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int w, int h, uint32_t *sse,
+ int *sum) {
int i, j;
- int16x8_t v_sum = vdupq_n_s16(0);
- int32x4_t v_sse_lo = vdupq_n_s32(0);
- int32x4_t v_sse_hi = vdupq_n_s32(0);
+ int16x8_t sum_s16 = vdupq_n_s16(0);
+ int32x4_t sse_lo_s32 = vdupq_n_s32(0);
+ int32x4_t sse_hi_s32 = vdupq_n_s32(0);
for (i = 0; i < h; ++i) {
- for (j = 0; j < w; j += 8) {
- const uint8x8_t v_a = vld1_u8(&a[j]);
- const uint8x8_t v_b = vld1_u8(&b[j]);
- const uint16x8_t v_diff = vsubl_u8(v_a, v_b);
- const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff);
- v_sum = vaddq_s16(v_sum, sv_diff);
- v_sse_lo =
- vmlal_s16(v_sse_lo, vget_low_s16(sv_diff), vget_low_s16(sv_diff));
- v_sse_hi =
- vmlal_s16(v_sse_hi, vget_high_s16(sv_diff), vget_high_s16(sv_diff));
+ for (j = 0; j < w; j += 16) {
+ const uint8x16_t a_u8 = vld1q_u8(a + j);
+ const uint8x16_t b_u8 = vld1q_u8(b + j);
+
+ const uint16x8_t diff_lo_u16 =
+ vsubl_u8(vget_low_u8(a_u8), vget_low_u8(b_u8));
+ const uint16x8_t diff_hi_u16 =
+ vsubl_u8(vget_high_u8(a_u8), vget_high_u8(b_u8));
+
+ const int16x8_t diff_lo_s16 = vreinterpretq_s16_u16(diff_lo_u16);
+ const int16x8_t diff_hi_s16 = vreinterpretq_s16_u16(diff_hi_u16);
+
+ sum_s16 = vaddq_s16(sum_s16, diff_lo_s16);
+ sum_s16 = vaddq_s16(sum_s16, diff_hi_s16);
+
+ sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_low_s16(diff_lo_s16),
+ vget_low_s16(diff_lo_s16));
+ sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_high_s16(diff_lo_s16),
+ vget_high_s16(diff_lo_s16));
+
+ sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_low_s16(diff_hi_s16),
+ vget_low_s16(diff_hi_s16));
+ sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_high_s16(diff_hi_s16),
+ vget_high_s16(diff_hi_s16));
}
a += a_stride;
b += b_stride;
}
- *sum = horizontal_add_s16x8(v_sum);
- *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi));
+ *sum = horizontal_add_s16x8(sum_s16);
+ *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(sse_lo_s32, sse_hi_s32));
+}
+
+// w * h must be less than 2048 or sum_s16 may overflow.
+// Process a block of width 8 two rows at a time.
+static void variance_neon_w8x2(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int h, uint32_t *sse, int *sum) {
+ int i = 0;
+ int16x8_t sum_s16 = vdupq_n_s16(0);
+ int32x4_t sse_lo_s32 = vdupq_n_s32(0);
+ int32x4_t sse_hi_s32 = vdupq_n_s32(0);
+
+ do {
+ const uint8x8_t a_0_u8 = vld1_u8(a);
+ const uint8x8_t a_1_u8 = vld1_u8(a + a_stride);
+ const uint8x8_t b_0_u8 = vld1_u8(b);
+ const uint8x8_t b_1_u8 = vld1_u8(b + b_stride);
+ const uint16x8_t diff_0_u16 = vsubl_u8(a_0_u8, b_0_u8);
+ const uint16x8_t diff_1_u16 = vsubl_u8(a_1_u8, b_1_u8);
+ const int16x8_t diff_0_s16 = vreinterpretq_s16_u16(diff_0_u16);
+ const int16x8_t diff_1_s16 = vreinterpretq_s16_u16(diff_1_u16);
+ sum_s16 = vaddq_s16(sum_s16, diff_0_s16);
+ sum_s16 = vaddq_s16(sum_s16, diff_1_s16);
+ sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_low_s16(diff_0_s16),
+ vget_low_s16(diff_0_s16));
+ sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_low_s16(diff_1_s16),
+ vget_low_s16(diff_1_s16));
+ sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_high_s16(diff_0_s16),
+ vget_high_s16(diff_0_s16));
+ sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_high_s16(diff_1_s16),
+ vget_high_s16(diff_1_s16));
+ a += a_stride + a_stride;
+ b += b_stride + b_stride;
+ i += 2;
+ } while (i < h);
+
+ *sum = horizontal_add_s16x8(sum_s16);
+ *sse = (uint32_t)horizontal_add_s32x4(vaddq_s32(sse_lo_s32, sse_hi_s32));
}
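A note on the w8x2 kernel (the rationale is inferred from the code, not stated in the patch): pairing two 8-wide rows per iteration feeds both 32-bit accumulators on every pass and halves the loop overhead relative to one row at a time; it also requires h to be even, which holds for every block size instantiated below (m = 4, 8, or 16 on the 8-wide path).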
void vpx_get8x8var_neon(const uint8_t *a, int a_stride, const uint8_t *b,
int b_stride, unsigned int *sse, int *sum) {
- variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, sum);
+ variance_neon_w8x2(a, a_stride, b, b_stride, 8, sse, sum);
}
void vpx_get16x16var_neon(const uint8_t *a, int a_stride, const uint8_t *b,
int b_stride, unsigned int *sse, int *sum) {
- variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, sum);
-}
-
-unsigned int vpx_variance8x8_neon(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- unsigned int *sse) {
- int sum;
- variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &sum);
- return *sse - ((sum * sum) >> 6);
+ variance_neon_w16(a, a_stride, b, b_stride, 16, 16, sse, sum);
}
-unsigned int vpx_variance16x16_neon(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- unsigned int *sse) {
- int sum;
- variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, &sum);
- return *sse - (((uint32_t)((int64_t)sum * sum)) >> 8);
-}
+#define varianceNxM(n, m, shift) \
+ unsigned int vpx_variance##n##x##m##_neon(const uint8_t *a, int a_stride, \
+ const uint8_t *b, int b_stride, \
+ unsigned int *sse) { \
+ int sum; \
+ if (n == 8) \
+ variance_neon_w8x2(a, a_stride, b, b_stride, m, sse, &sum); \
+ else \
+ variance_neon_w16(a, a_stride, b, b_stride, n, m, sse, &sum); \
+ if (n * m < 16 * 16) \
+ return *sse - ((sum * sum) >> shift); \
+ else \
+ return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \
+ }
-unsigned int vpx_variance32x32_neon(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- unsigned int *sse) {
- int sum;
- variance_neon_w8(a, a_stride, b, b_stride, 32, 32, sse, &sum);
- return *sse - (unsigned int)(((int64_t)sum * sum) >> 10);
-}
+varianceNxM(8, 4, 5);
+varianceNxM(8, 8, 6);
+varianceNxM(8, 16, 7);
+varianceNxM(16, 8, 7);
+varianceNxM(16, 16, 8);
+varianceNxM(16, 32, 9);
+varianceNxM(32, 16, 9);
+varianceNxM(32, 32, 10);
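The shift argument is log2(n * m), i.e. variance = *sse - sum * sum / (n * m). The n * m < 16 * 16 branch is an overflow guard: on the 8-wide path the worst-case |sum| is 8 * 16 * 255 = 32640, so sum * sum is at most 1,065,369,600 and fits a 32-bit int, whereas a 16x16 block can reach 65280 * 65280 = 4,261,478,400 and must be widened to int64_t before shifting.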
unsigned int vpx_variance32x64_neon(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
unsigned int *sse) {
int sum1, sum2;
uint32_t sse1, sse2;
- variance_neon_w8(a, a_stride, b, b_stride, 32, 32, &sse1, &sum1);
- variance_neon_w8(a + (32 * a_stride), a_stride, b + (32 * b_stride), b_stride,
- 32, 32, &sse2, &sum2);
+ variance_neon_w16(a, a_stride, b, b_stride, 32, 32, &sse1, &sum1);
+ variance_neon_w16(a + (32 * a_stride), a_stride, b + (32 * b_stride),
+ b_stride, 32, 32, &sse2, &sum2);
*sse = sse1 + sse2;
sum1 += sum2;
return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 11);
@@ -112,9 +164,9 @@ unsigned int vpx_variance64x32_neon(const uint8_t *a, int a_stride,
unsigned int *sse) {
int sum1, sum2;
uint32_t sse1, sse2;
- variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
- variance_neon_w8(a + (16 * a_stride), a_stride, b + (16 * b_stride), b_stride,
- 64, 16, &sse2, &sum2);
+ variance_neon_w16(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
+ variance_neon_w16(a + (16 * a_stride), a_stride, b + (16 * b_stride),
+ b_stride, 64, 16, &sse2, &sum2);
*sse = sse1 + sse2;
sum1 += sum2;
return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 11);
@@ -126,162 +178,24 @@ unsigned int vpx_variance64x64_neon(const uint8_t *a, int a_stride,
int sum1, sum2;
uint32_t sse1, sse2;
- variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
- variance_neon_w8(a + (16 * a_stride), a_stride, b + (16 * b_stride), b_stride,
- 64, 16, &sse2, &sum2);
+ variance_neon_w16(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
+ variance_neon_w16(a + (16 * a_stride), a_stride, b + (16 * b_stride),
+ b_stride, 64, 16, &sse2, &sum2);
sse1 += sse2;
sum1 += sum2;
- variance_neon_w8(a + (16 * 2 * a_stride), a_stride, b + (16 * 2 * b_stride),
- b_stride, 64, 16, &sse2, &sum2);
+ variance_neon_w16(a + (16 * 2 * a_stride), a_stride, b + (16 * 2 * b_stride),
+ b_stride, 64, 16, &sse2, &sum2);
sse1 += sse2;
sum1 += sum2;
- variance_neon_w8(a + (16 * 3 * a_stride), a_stride, b + (16 * 3 * b_stride),
- b_stride, 64, 16, &sse2, &sum2);
+ variance_neon_w16(a + (16 * 3 * a_stride), a_stride, b + (16 * 3 * b_stride),
+ b_stride, 64, 16, &sse2, &sum2);
*sse = sse1 + sse2;
sum1 += sum2;
return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 12);
}
-unsigned int vpx_variance16x8_neon(const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride, unsigned int *sse) {
- int i;
- int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
- uint32x2_t d0u32, d10u32;
- int64x1_t d0s64, d1s64;
- uint8x16_t q0u8, q1u8, q2u8, q3u8;
- uint16x8_t q11u16, q12u16, q13u16, q14u16;
- int32x4_t q8s32, q9s32, q10s32;
- int64x2_t q0s64, q1s64, q5s64;
-
- q8s32 = vdupq_n_s32(0);
- q9s32 = vdupq_n_s32(0);
- q10s32 = vdupq_n_s32(0);
-
- for (i = 0; i < 4; i++) {
- q0u8 = vld1q_u8(src_ptr);
- src_ptr += source_stride;
- q1u8 = vld1q_u8(src_ptr);
- src_ptr += source_stride;
- __builtin_prefetch(src_ptr);
-
- q2u8 = vld1q_u8(ref_ptr);
- ref_ptr += recon_stride;
- q3u8 = vld1q_u8(ref_ptr);
- ref_ptr += recon_stride;
- __builtin_prefetch(ref_ptr);
-
- q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
- q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
- q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
- q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
-
- d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
- d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
- q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
- q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
-
- d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
- d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
- q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
- q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
-
- d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
- d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
- q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
- q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
-
- d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
- d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
- q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
- q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
- }
-
- q10s32 = vaddq_s32(q10s32, q9s32);
- q0s64 = vpaddlq_s32(q8s32);
- q1s64 = vpaddlq_s32(q10s32);
-
- d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
- d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
-
- q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), vreinterpret_s32_s64(d0s64));
- vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
-
- d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
- d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
-
- return vget_lane_u32(d0u32, 0);
-}
-
-unsigned int vpx_variance8x16_neon(const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride, unsigned int *sse) {
- int i;
- uint8x8_t d0u8, d2u8, d4u8, d6u8;
- int16x4_t d22s16, d23s16, d24s16, d25s16;
- uint32x2_t d0u32, d10u32;
- int64x1_t d0s64, d1s64;
- uint16x8_t q11u16, q12u16;
- int32x4_t q8s32, q9s32, q10s32;
- int64x2_t q0s64, q1s64, q5s64;
-
- q8s32 = vdupq_n_s32(0);
- q9s32 = vdupq_n_s32(0);
- q10s32 = vdupq_n_s32(0);
-
- for (i = 0; i < 8; i++) {
- d0u8 = vld1_u8(src_ptr);
- src_ptr += source_stride;
- d2u8 = vld1_u8(src_ptr);
- src_ptr += source_stride;
- __builtin_prefetch(src_ptr);
-
- d4u8 = vld1_u8(ref_ptr);
- ref_ptr += recon_stride;
- d6u8 = vld1_u8(ref_ptr);
- ref_ptr += recon_stride;
- __builtin_prefetch(ref_ptr);
-
- q11u16 = vsubl_u8(d0u8, d4u8);
- q12u16 = vsubl_u8(d2u8, d6u8);
-
- d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
- d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
- q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
- q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
-
- d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
- d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
- q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
- q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
- }
-
- q10s32 = vaddq_s32(q10s32, q9s32);
- q0s64 = vpaddlq_s32(q8s32);
- q1s64 = vpaddlq_s32(q10s32);
-
- d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
- d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
-
- q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), vreinterpret_s32_s64(d0s64));
- vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
-
- d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
- d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
-
- return vget_lane_u32(d0u32, 0);
-}
-
unsigned int vpx_mse16x16_neon(const unsigned char *src_ptr, int source_stride,
const unsigned char *ref_ptr, int recon_stride,
unsigned int *sse) {
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/inv_txfm.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/inv_txfm.c
index 210a9bed962..29323d1b899 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/inv_txfm.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/inv_txfm.c
@@ -1182,16 +1182,10 @@ void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
// Rows
for (i = 0; i < 32; ++i) {
- int16_t zero_coeff[16];
- for (j = 0; j < 16; ++j) zero_coeff[j] = input[2 * j] | input[2 * j + 1];
- for (j = 0; j < 8; ++j)
- zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
- for (j = 0; j < 4; ++j)
- zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
- for (j = 0; j < 2; ++j)
- zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
-
- if (zero_coeff[0] | zero_coeff[1])
+ int16_t zero_coeff = 0;
+ for (j = 0; j < 32; ++j) zero_coeff |= input[j];
+
+ if (zero_coeff)
idct32_c(input, outptr);
else
memset(outptr, 0, sizeof(tran_low_t) * 32);
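Both forms compute the same predicate -- whether all 32 coefficients of the row are zero -- but the old code OR-reduced pairwise through a 16-entry scratch array (16 + 8 + 4 + 2 ORs plus the final test) while the replacement ORs the 32 values straight into one accumulator: no extra work, far less code. The identical simplification is applied to vpx_highbd_idct32x32_1024_add_c further down.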
@@ -1290,7 +1284,7 @@ static INLINE int detect_invalid_highbd_input(const tran_low_t *input,
return 0;
}
-void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
/* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
0.5 shifts per pixel. */
@@ -1299,7 +1293,6 @@ void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
tran_high_t a1, b1, c1, d1, e1;
const tran_low_t *ip = input;
tran_low_t *op = output;
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
for (i = 0; i < 4; i++) {
a1 = ip[0] >> UNIT_QUANT_SHIFT;
@@ -1348,14 +1341,13 @@ void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
}
}
-void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
+void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint16_t *dest,
int stride, int bd) {
int i;
tran_high_t a1, e1;
tran_low_t tmp[4];
const tran_low_t *ip = in;
tran_low_t *op = tmp;
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
(void)bd;
a1 = ip[0] >> UNIT_QUANT_SHIFT;
@@ -1452,13 +1444,12 @@ void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) {
output[3] = HIGHBD_WRAPLOW(step[0] - step[3], bd);
}
-void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
int i, j;
tran_low_t out[4 * 4];
tran_low_t *outptr = out;
tran_low_t temp_in[4], temp_out[4];
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
// Rows
for (i = 0; i < 4; ++i) {
@@ -1478,13 +1469,12 @@ void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
}
}
-void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
int i;
tran_high_t a1;
tran_low_t out =
HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
a1 = ROUND_POWER_OF_TWO(out, 4);
@@ -1636,13 +1626,12 @@ void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) {
output[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
}
-void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
int i, j;
tran_low_t out[8 * 8];
tran_low_t *outptr = out;
tran_low_t temp_in[8], temp_out[8];
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
// First transform rows
for (i = 0; i < 8; ++i) {
@@ -1662,13 +1651,12 @@ void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
}
}
-void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
int i, j;
tran_low_t out[8 * 8] = { 0 };
tran_low_t *outptr = out;
tran_low_t temp_in[8], temp_out[8];
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
// First transform rows
  // Only the first 4 rows have non-zero coefs
@@ -1689,13 +1677,12 @@ void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest8,
}
}
-void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
int i, j;
tran_high_t a1;
tran_low_t out =
HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
a1 = ROUND_POWER_OF_TWO(out, 5);
@@ -2056,13 +2043,12 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
output[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd);
}
-void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
int i, j;
tran_low_t out[16 * 16];
tran_low_t *outptr = out;
tran_low_t temp_in[16], temp_out[16];
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
// First transform rows
for (i = 0; i < 16; ++i) {
@@ -2082,13 +2068,12 @@ void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
}
}
-void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
int i, j;
tran_low_t out[16 * 16] = { 0 };
tran_low_t *outptr = out;
tran_low_t temp_in[16], temp_out[16];
- uint16_t *const dest = CONVERT_TO_SHORTPTR(dest8);
// First transform rows. Since all non-zero dct coefficients are in
// upper-left 8x8 area, we only need to calculate first 8 rows here.
@@ -2111,13 +2096,12 @@ void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest8,
}
}
-void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
int i, j;
tran_low_t out[16 * 16] = { 0 };
tran_low_t *outptr = out;
tran_low_t temp_in[16], temp_out[16];
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
// First transform rows. Since all non-zero dct coefficients are in
// upper-left 4x4 area, we only need to calculate first 4 rows here.
@@ -2138,13 +2122,12 @@ void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,
}
}
-void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
int i, j;
tran_high_t a1;
tran_low_t out =
HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
a1 = ROUND_POWER_OF_TWO(out, 6);
@@ -2531,26 +2514,19 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
output[31] = HIGHBD_WRAPLOW(step1[0] - step1[31], bd);
}
-void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
int i, j;
tran_low_t out[32 * 32];
tran_low_t *outptr = out;
tran_low_t temp_in[32], temp_out[32];
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
// Rows
for (i = 0; i < 32; ++i) {
- tran_low_t zero_coeff[16];
- for (j = 0; j < 16; ++j) zero_coeff[j] = input[2 * j] | input[2 * j + 1];
- for (j = 0; j < 8; ++j)
- zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
- for (j = 0; j < 4; ++j)
- zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
- for (j = 0; j < 2; ++j)
- zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
-
- if (zero_coeff[0] | zero_coeff[1])
+ tran_low_t zero_coeff = 0;
+ for (j = 0; j < 32; ++j) zero_coeff |= input[j];
+
+ if (zero_coeff)
highbd_idct32_c(input, outptr, bd);
else
memset(outptr, 0, sizeof(tran_low_t) * 32);
@@ -2569,13 +2545,12 @@ void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
}
}
-void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
int i, j;
tran_low_t out[32 * 32] = { 0 };
tran_low_t *outptr = out;
tran_low_t temp_in[32], temp_out[32];
- uint16_t *const dest = CONVERT_TO_SHORTPTR(dest8);
// Rows
// Only upper-left 16x16 has non-zero coeff
@@ -2598,13 +2573,12 @@ void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest8,
}
}
-void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
int i, j;
tran_low_t out[32 * 32] = { 0 };
tran_low_t *outptr = out;
tran_low_t temp_in[32], temp_out[32];
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
// Rows
// Only upper-left 8x8 has non-zero coeff
@@ -2625,11 +2599,10 @@ void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8,
}
}
-void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
int i, j;
int a1;
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
tran_low_t out =
HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/ppc/intrapred_vsx.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/ppc/intrapred_vsx.c
new file mode 100644
index 00000000000..6273460f190
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/ppc/intrapred_vsx.c
@@ -0,0 +1,749 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/ppc/types_vsx.h"
+
+void vpx_v_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t d = vec_vsx_ld(0, above);
+ int i;
+ (void)left;
+
+ for (i = 0; i < 16; i++, dst += stride) {
+ vec_vsx_st(d, 0, dst);
+ }
+}
+
+void vpx_v_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t d0 = vec_vsx_ld(0, above);
+ const uint8x16_t d1 = vec_vsx_ld(16, above);
+ int i;
+ (void)left;
+
+ for (i = 0; i < 32; i++, dst += stride) {
+ vec_vsx_st(d0, 0, dst);
+ vec_vsx_st(d1, 16, dst);
+ }
+}
+
+static const uint32x4_t mask4 = { 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF };
+
+void vpx_h_predictor_4x4_vsx(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t d = vec_vsx_ld(0, left);
+ const uint8x16_t v0 = vec_splat(d, 0);
+ const uint8x16_t v1 = vec_splat(d, 1);
+ const uint8x16_t v2 = vec_splat(d, 2);
+ const uint8x16_t v3 = vec_splat(d, 3);
+
+ (void)above;
+
+ vec_vsx_st(vec_sel(v0, vec_vsx_ld(0, dst), (uint8x16_t)mask4), 0, dst);
+ dst += stride;
+ vec_vsx_st(vec_sel(v1, vec_vsx_ld(0, dst), (uint8x16_t)mask4), 0, dst);
+ dst += stride;
+ vec_vsx_st(vec_sel(v2, vec_vsx_ld(0, dst), (uint8x16_t)mask4), 0, dst);
+ dst += stride;
+ vec_vsx_st(vec_sel(v3, vec_vsx_ld(0, dst), (uint8x16_t)mask4), 0, dst);
+}
+
+void vpx_h_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t d = vec_vsx_ld(0, left);
+ const uint8x16_t v0 = vec_splat(d, 0);
+ const uint8x16_t v1 = vec_splat(d, 1);
+ const uint8x16_t v2 = vec_splat(d, 2);
+ const uint8x16_t v3 = vec_splat(d, 3);
+
+ const uint8x16_t v4 = vec_splat(d, 4);
+ const uint8x16_t v5 = vec_splat(d, 5);
+ const uint8x16_t v6 = vec_splat(d, 6);
+ const uint8x16_t v7 = vec_splat(d, 7);
+
+ (void)above;
+
+ vec_vsx_st(xxpermdi(v0, vec_vsx_ld(0, dst), 1), 0, dst);
+ dst += stride;
+ vec_vsx_st(xxpermdi(v1, vec_vsx_ld(0, dst), 1), 0, dst);
+ dst += stride;
+ vec_vsx_st(xxpermdi(v2, vec_vsx_ld(0, dst), 1), 0, dst);
+ dst += stride;
+ vec_vsx_st(xxpermdi(v3, vec_vsx_ld(0, dst), 1), 0, dst);
+ dst += stride;
+ vec_vsx_st(xxpermdi(v4, vec_vsx_ld(0, dst), 1), 0, dst);
+ dst += stride;
+ vec_vsx_st(xxpermdi(v5, vec_vsx_ld(0, dst), 1), 0, dst);
+ dst += stride;
+ vec_vsx_st(xxpermdi(v6, vec_vsx_ld(0, dst), 1), 0, dst);
+ dst += stride;
+ vec_vsx_st(xxpermdi(v7, vec_vsx_ld(0, dst), 1), 0, dst);
+}
+
+void vpx_h_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t d = vec_vsx_ld(0, left);
+ const uint8x16_t v0 = vec_splat(d, 0);
+ const uint8x16_t v1 = vec_splat(d, 1);
+ const uint8x16_t v2 = vec_splat(d, 2);
+ const uint8x16_t v3 = vec_splat(d, 3);
+
+ const uint8x16_t v4 = vec_splat(d, 4);
+ const uint8x16_t v5 = vec_splat(d, 5);
+ const uint8x16_t v6 = vec_splat(d, 6);
+ const uint8x16_t v7 = vec_splat(d, 7);
+
+ const uint8x16_t v8 = vec_splat(d, 8);
+ const uint8x16_t v9 = vec_splat(d, 9);
+ const uint8x16_t v10 = vec_splat(d, 10);
+ const uint8x16_t v11 = vec_splat(d, 11);
+
+ const uint8x16_t v12 = vec_splat(d, 12);
+ const uint8x16_t v13 = vec_splat(d, 13);
+ const uint8x16_t v14 = vec_splat(d, 14);
+ const uint8x16_t v15 = vec_splat(d, 15);
+
+ (void)above;
+
+ vec_vsx_st(v0, 0, dst);
+ dst += stride;
+ vec_vsx_st(v1, 0, dst);
+ dst += stride;
+ vec_vsx_st(v2, 0, dst);
+ dst += stride;
+ vec_vsx_st(v3, 0, dst);
+ dst += stride;
+ vec_vsx_st(v4, 0, dst);
+ dst += stride;
+ vec_vsx_st(v5, 0, dst);
+ dst += stride;
+ vec_vsx_st(v6, 0, dst);
+ dst += stride;
+ vec_vsx_st(v7, 0, dst);
+ dst += stride;
+ vec_vsx_st(v8, 0, dst);
+ dst += stride;
+ vec_vsx_st(v9, 0, dst);
+ dst += stride;
+ vec_vsx_st(v10, 0, dst);
+ dst += stride;
+ vec_vsx_st(v11, 0, dst);
+ dst += stride;
+ vec_vsx_st(v12, 0, dst);
+ dst += stride;
+ vec_vsx_st(v13, 0, dst);
+ dst += stride;
+ vec_vsx_st(v14, 0, dst);
+ dst += stride;
+ vec_vsx_st(v15, 0, dst);
+}
+
+#define H_PREDICTOR_32(v) \
+ vec_vsx_st(v, 0, dst); \
+ vec_vsx_st(v, 16, dst); \
+ dst += stride
+
+void vpx_h_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t d0 = vec_vsx_ld(0, left);
+ const uint8x16_t d1 = vec_vsx_ld(16, left);
+
+ const uint8x16_t v0_0 = vec_splat(d0, 0);
+ const uint8x16_t v1_0 = vec_splat(d0, 1);
+ const uint8x16_t v2_0 = vec_splat(d0, 2);
+ const uint8x16_t v3_0 = vec_splat(d0, 3);
+ const uint8x16_t v4_0 = vec_splat(d0, 4);
+ const uint8x16_t v5_0 = vec_splat(d0, 5);
+ const uint8x16_t v6_0 = vec_splat(d0, 6);
+ const uint8x16_t v7_0 = vec_splat(d0, 7);
+ const uint8x16_t v8_0 = vec_splat(d0, 8);
+ const uint8x16_t v9_0 = vec_splat(d0, 9);
+ const uint8x16_t v10_0 = vec_splat(d0, 10);
+ const uint8x16_t v11_0 = vec_splat(d0, 11);
+ const uint8x16_t v12_0 = vec_splat(d0, 12);
+ const uint8x16_t v13_0 = vec_splat(d0, 13);
+ const uint8x16_t v14_0 = vec_splat(d0, 14);
+ const uint8x16_t v15_0 = vec_splat(d0, 15);
+
+ const uint8x16_t v0_1 = vec_splat(d1, 0);
+ const uint8x16_t v1_1 = vec_splat(d1, 1);
+ const uint8x16_t v2_1 = vec_splat(d1, 2);
+ const uint8x16_t v3_1 = vec_splat(d1, 3);
+ const uint8x16_t v4_1 = vec_splat(d1, 4);
+ const uint8x16_t v5_1 = vec_splat(d1, 5);
+ const uint8x16_t v6_1 = vec_splat(d1, 6);
+ const uint8x16_t v7_1 = vec_splat(d1, 7);
+ const uint8x16_t v8_1 = vec_splat(d1, 8);
+ const uint8x16_t v9_1 = vec_splat(d1, 9);
+ const uint8x16_t v10_1 = vec_splat(d1, 10);
+ const uint8x16_t v11_1 = vec_splat(d1, 11);
+ const uint8x16_t v12_1 = vec_splat(d1, 12);
+ const uint8x16_t v13_1 = vec_splat(d1, 13);
+ const uint8x16_t v14_1 = vec_splat(d1, 14);
+ const uint8x16_t v15_1 = vec_splat(d1, 15);
+
+ (void)above;
+
+ H_PREDICTOR_32(v0_0);
+ H_PREDICTOR_32(v1_0);
+ H_PREDICTOR_32(v2_0);
+ H_PREDICTOR_32(v3_0);
+
+ H_PREDICTOR_32(v4_0);
+ H_PREDICTOR_32(v5_0);
+ H_PREDICTOR_32(v6_0);
+ H_PREDICTOR_32(v7_0);
+
+ H_PREDICTOR_32(v8_0);
+ H_PREDICTOR_32(v9_0);
+ H_PREDICTOR_32(v10_0);
+ H_PREDICTOR_32(v11_0);
+
+ H_PREDICTOR_32(v12_0);
+ H_PREDICTOR_32(v13_0);
+ H_PREDICTOR_32(v14_0);
+ H_PREDICTOR_32(v15_0);
+
+ H_PREDICTOR_32(v0_1);
+ H_PREDICTOR_32(v1_1);
+ H_PREDICTOR_32(v2_1);
+ H_PREDICTOR_32(v3_1);
+
+ H_PREDICTOR_32(v4_1);
+ H_PREDICTOR_32(v5_1);
+ H_PREDICTOR_32(v6_1);
+ H_PREDICTOR_32(v7_1);
+
+ H_PREDICTOR_32(v8_1);
+ H_PREDICTOR_32(v9_1);
+ H_PREDICTOR_32(v10_1);
+ H_PREDICTOR_32(v11_1);
+
+ H_PREDICTOR_32(v12_1);
+ H_PREDICTOR_32(v13_1);
+ H_PREDICTOR_32(v14_1);
+ H_PREDICTOR_32(v15_1);
+}
+
+void vpx_tm_predictor_4x4_vsx(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0));
+ const int16x8_t l = unpack_to_s16_h(vec_vsx_ld(0, left));
+ const int16x8_t a = unpack_to_s16_h(vec_vsx_ld(0, above));
+ int16x8_t tmp, val;
+ uint8x16_t d;
+
+ d = vec_vsx_ld(0, dst);
+ tmp = unpack_to_s16_l(d);
+ val = vec_sub(vec_add(vec_splat(l, 0), a), tl);
+ vec_vsx_st(vec_sel(vec_packsu(val, tmp), d, (uint8x16_t)mask4), 0, dst);
+ dst += stride;
+
+ d = vec_vsx_ld(0, dst);
+ tmp = unpack_to_s16_l(d);
+ val = vec_sub(vec_add(vec_splat(l, 1), a), tl);
+ vec_vsx_st(vec_sel(vec_packsu(val, tmp), d, (uint8x16_t)mask4), 0, dst);
+ dst += stride;
+
+ d = vec_vsx_ld(0, dst);
+ tmp = unpack_to_s16_l(d);
+ val = vec_sub(vec_add(vec_splat(l, 2), a), tl);
+ vec_vsx_st(vec_sel(vec_packsu(val, tmp), d, (uint8x16_t)mask4), 0, dst);
+ dst += stride;
+
+ d = vec_vsx_ld(0, dst);
+ tmp = unpack_to_s16_l(d);
+ val = vec_sub(vec_add(vec_splat(l, 3), a), tl);
+ vec_vsx_st(vec_sel(vec_packsu(val, tmp), d, (uint8x16_t)mask4), 0, dst);
+}
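TM ("TrueMotion") prediction computes pred[r][c] = clip(left[r] + above[c] - above[-1]) for every pixel; each four-line group above does exactly that, with vec_splat(l, r) broadcasting left[r], the add/sub pair applying the formula in 16 bits, and vec_packsu supplying the clip by saturating back to 8 bits. A scalar reference for the 4x4 case (illustrative; the function name is made up):

    #include <stdint.h>

    static void tm_predictor_4x4_ref(uint8_t *dst, int stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
      const int tl = above[-1]; /* top-left neighbor */
      int r, c;
      for (r = 0; r < 4; r++, dst += stride) {
        for (c = 0; c < 4; c++) {
          const int v = left[r] + above[c] - tl;
          dst[c] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v); /* clip */
        }
      }
    }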
+
+void vpx_tm_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0));
+ const int16x8_t l = unpack_to_s16_h(vec_vsx_ld(0, left));
+ const int16x8_t a = unpack_to_s16_h(vec_vsx_ld(0, above));
+ int16x8_t tmp, val;
+
+ tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
+ val = vec_sub(vec_add(vec_splat(l, 0), a), tl);
+ vec_vsx_st(vec_packsu(val, tmp), 0, dst);
+ dst += stride;
+
+ tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
+ val = vec_sub(vec_add(vec_splat(l, 1), a), tl);
+ vec_vsx_st(vec_packsu(val, tmp), 0, dst);
+ dst += stride;
+
+ tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
+ val = vec_sub(vec_add(vec_splat(l, 2), a), tl);
+ vec_vsx_st(vec_packsu(val, tmp), 0, dst);
+ dst += stride;
+
+ tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
+ val = vec_sub(vec_add(vec_splat(l, 3), a), tl);
+ vec_vsx_st(vec_packsu(val, tmp), 0, dst);
+ dst += stride;
+
+ tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
+ val = vec_sub(vec_add(vec_splat(l, 4), a), tl);
+ vec_vsx_st(vec_packsu(val, tmp), 0, dst);
+ dst += stride;
+
+ tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
+ val = vec_sub(vec_add(vec_splat(l, 5), a), tl);
+ vec_vsx_st(vec_packsu(val, tmp), 0, dst);
+ dst += stride;
+
+ tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
+ val = vec_sub(vec_add(vec_splat(l, 6), a), tl);
+ vec_vsx_st(vec_packsu(val, tmp), 0, dst);
+ dst += stride;
+
+ tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
+ val = vec_sub(vec_add(vec_splat(l, 7), a), tl);
+ vec_vsx_st(vec_packsu(val, tmp), 0, dst);
+}
+
+static void tm_predictor_16x8(uint8_t *dst, const ptrdiff_t stride, int16x8_t l,
+ int16x8_t ah, int16x8_t al, int16x8_t tl) {
+ int16x8_t vh, vl, ls;
+
+ ls = vec_splat(l, 0);
+ vh = vec_sub(vec_add(ls, ah), tl);
+ vl = vec_sub(vec_add(ls, al), tl);
+ vec_vsx_st(vec_packsu(vh, vl), 0, dst);
+ dst += stride;
+
+ ls = vec_splat(l, 1);
+ vh = vec_sub(vec_add(ls, ah), tl);
+ vl = vec_sub(vec_add(ls, al), tl);
+ vec_vsx_st(vec_packsu(vh, vl), 0, dst);
+ dst += stride;
+
+ ls = vec_splat(l, 2);
+ vh = vec_sub(vec_add(ls, ah), tl);
+ vl = vec_sub(vec_add(ls, al), tl);
+ vec_vsx_st(vec_packsu(vh, vl), 0, dst);
+ dst += stride;
+
+ ls = vec_splat(l, 3);
+ vh = vec_sub(vec_add(ls, ah), tl);
+ vl = vec_sub(vec_add(ls, al), tl);
+ vec_vsx_st(vec_packsu(vh, vl), 0, dst);
+ dst += stride;
+
+ ls = vec_splat(l, 4);
+ vh = vec_sub(vec_add(ls, ah), tl);
+ vl = vec_sub(vec_add(ls, al), tl);
+ vec_vsx_st(vec_packsu(vh, vl), 0, dst);
+ dst += stride;
+
+ ls = vec_splat(l, 5);
+ vh = vec_sub(vec_add(ls, ah), tl);
+ vl = vec_sub(vec_add(ls, al), tl);
+ vec_vsx_st(vec_packsu(vh, vl), 0, dst);
+ dst += stride;
+
+ ls = vec_splat(l, 6);
+ vh = vec_sub(vec_add(ls, ah), tl);
+ vl = vec_sub(vec_add(ls, al), tl);
+ vec_vsx_st(vec_packsu(vh, vl), 0, dst);
+ dst += stride;
+
+ ls = vec_splat(l, 7);
+ vh = vec_sub(vec_add(ls, ah), tl);
+ vl = vec_sub(vec_add(ls, al), tl);
+ vec_vsx_st(vec_packsu(vh, vl), 0, dst);
+}
+
+void vpx_tm_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0));
+ const uint8x16_t l = vec_vsx_ld(0, left);
+ const int16x8_t lh = unpack_to_s16_h(l);
+ const int16x8_t ll = unpack_to_s16_l(l);
+ const uint8x16_t a = vec_vsx_ld(0, above);
+ const int16x8_t ah = unpack_to_s16_h(a);
+ const int16x8_t al = unpack_to_s16_l(a);
+
+ tm_predictor_16x8(dst, stride, lh, ah, al, tl);
+
+ dst += stride * 8;
+
+ tm_predictor_16x8(dst, stride, ll, ah, al, tl);
+}
+
+static INLINE void tm_predictor_32x1(uint8_t *dst, const int16x8_t ls,
+ const int16x8_t a0h, const int16x8_t a0l,
+ const int16x8_t a1h, const int16x8_t a1l,
+ const int16x8_t tl) {
+ int16x8_t vh, vl;
+
+ vh = vec_sub(vec_add(ls, a0h), tl);
+ vl = vec_sub(vec_add(ls, a0l), tl);
+ vec_vsx_st(vec_packsu(vh, vl), 0, dst);
+ vh = vec_sub(vec_add(ls, a1h), tl);
+ vl = vec_sub(vec_add(ls, a1l), tl);
+ vec_vsx_st(vec_packsu(vh, vl), 16, dst);
+}
+
+static void tm_predictor_32x8(uint8_t *dst, const ptrdiff_t stride,
+ const int16x8_t l, const uint8x16_t a0,
+ const uint8x16_t a1, const int16x8_t tl) {
+ const int16x8_t a0h = unpack_to_s16_h(a0);
+ const int16x8_t a0l = unpack_to_s16_l(a0);
+ const int16x8_t a1h = unpack_to_s16_h(a1);
+ const int16x8_t a1l = unpack_to_s16_l(a1);
+
+ tm_predictor_32x1(dst, vec_splat(l, 0), a0h, a0l, a1h, a1l, tl);
+ dst += stride;
+
+ tm_predictor_32x1(dst, vec_splat(l, 1), a0h, a0l, a1h, a1l, tl);
+ dst += stride;
+
+ tm_predictor_32x1(dst, vec_splat(l, 2), a0h, a0l, a1h, a1l, tl);
+ dst += stride;
+
+ tm_predictor_32x1(dst, vec_splat(l, 3), a0h, a0l, a1h, a1l, tl);
+ dst += stride;
+
+ tm_predictor_32x1(dst, vec_splat(l, 4), a0h, a0l, a1h, a1l, tl);
+ dst += stride;
+
+ tm_predictor_32x1(dst, vec_splat(l, 5), a0h, a0l, a1h, a1l, tl);
+ dst += stride;
+
+ tm_predictor_32x1(dst, vec_splat(l, 6), a0h, a0l, a1h, a1l, tl);
+ dst += stride;
+
+ tm_predictor_32x1(dst, vec_splat(l, 7), a0h, a0l, a1h, a1l, tl);
+}
+
+void vpx_tm_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0));
+ const uint8x16_t l0 = vec_vsx_ld(0, left);
+ const uint8x16_t l1 = vec_vsx_ld(16, left);
+ const uint8x16_t a0 = vec_vsx_ld(0, above);
+ const uint8x16_t a1 = vec_vsx_ld(16, above);
+
+ tm_predictor_32x8(dst, stride, unpack_to_s16_h(l0), a0, a1, tl);
+ dst += stride * 8;
+
+ tm_predictor_32x8(dst, stride, unpack_to_s16_l(l0), a0, a1, tl);
+ dst += stride * 8;
+
+ tm_predictor_32x8(dst, stride, unpack_to_s16_h(l1), a0, a1, tl);
+ dst += stride * 8;
+
+ tm_predictor_32x8(dst, stride, unpack_to_s16_l(l1), a0, a1, tl);
+}
+
+static INLINE void dc_fill_predictor_8x8(uint8_t *dst, const ptrdiff_t stride,
+ const uint8x16_t val) {
+ int i;
+
+ for (i = 0; i < 8; i++, dst += stride) {
+ const uint8x16_t d = vec_vsx_ld(0, dst);
+ vec_vsx_st(xxpermdi(val, d, 1), 0, dst);
+ }
+}
+
+static INLINE void dc_fill_predictor_16x16(uint8_t *dst, const ptrdiff_t stride,
+ const uint8x16_t val) {
+ int i;
+
+ for (i = 0; i < 16; i++, dst += stride) {
+ vec_vsx_st(val, 0, dst);
+ }
+}
+
+void vpx_dc_128_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t v128 = vec_sl(vec_splat_u8(1), vec_splat_u8(7));
+ (void)above;
+ (void)left;
+
+ dc_fill_predictor_16x16(dst, stride, v128);
+}
+
+static INLINE void dc_fill_predictor_32x32(uint8_t *dst, const ptrdiff_t stride,
+ const uint8x16_t val) {
+ int i;
+
+ for (i = 0; i < 32; i++, dst += stride) {
+ vec_vsx_st(val, 0, dst);
+ vec_vsx_st(val, 16, dst);
+ }
+}
+
+void vpx_dc_128_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t v128 = vec_sl(vec_splat_u8(1), vec_splat_u8(7));
+ (void)above;
+ (void)left;
+
+ dc_fill_predictor_32x32(dst, stride, v128);
+}
+
+static uint8x16_t avg16(const uint8_t *values) {
+ const int32x4_t sum4s =
+ (int32x4_t)vec_sum4s(vec_vsx_ld(0, values), vec_splat_u32(0));
+ const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s, vec_splat_s32(8));
+ const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(4));
+
+ return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)),
+ 3);
+}
+
+void vpx_dc_left_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+
+ dc_fill_predictor_16x16(dst, stride, avg16(left));
+}
+
+void vpx_dc_top_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+
+ dc_fill_predictor_16x16(dst, stride, avg16(above));
+}
+
+static uint8x16_t avg32(const uint8_t *values) {
+ const uint8x16_t v0 = vec_vsx_ld(0, values);
+ const uint8x16_t v1 = vec_vsx_ld(16, values);
+ const int32x4_t v16 = vec_sl(vec_splat_s32(1), vec_splat_u32(4));
+ const int32x4_t sum4s =
+ (int32x4_t)vec_sum4s(v0, vec_sum4s(v1, vec_splat_u32(0)));
+ const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s, v16);
+ const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(5));
+
+ return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)),
+ 3);
+}
+
+void vpx_dc_left_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+
+ dc_fill_predictor_32x32(dst, stride, avg32(left));
+}
+
+void vpx_dc_top_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+
+ dc_fill_predictor_32x32(dst, stride, avg32(above));
+}
+
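+// DC average for the 8x8 predictor: only the first 8 bytes of each 16-byte
+// load are valid, so xxpermdi keeps the two partial sums covering them and
+// zeroes the rest before the final (sum + 8) >> 4 rounding division.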
+static uint8x16_t dc_avg8(const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t a0 = vec_vsx_ld(0, above);
+ const uint8x16_t l0 = vec_vsx_ld(0, left);
+ const int32x4_t sum4s =
+ (int32x4_t)vec_sum4s(l0, vec_sum4s(a0, vec_splat_u32(0)));
+ const int32x4_t sum4s8 = xxpermdi(sum4s, vec_splat_s32(0), 1);
+ const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s8, vec_splat_s32(8));
+ const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(4));
+
+ return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)),
+ 3);
+}
+
+static uint8x16_t dc_avg16(const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t a0 = vec_vsx_ld(0, above);
+ const uint8x16_t l0 = vec_vsx_ld(0, left);
+ const int32x4_t v16 = vec_sl(vec_splat_s32(1), vec_splat_u32(4));
+ const int32x4_t sum4s =
+ (int32x4_t)vec_sum4s(l0, vec_sum4s(a0, vec_splat_u32(0)));
+ const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s, v16);
+ const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(5));
+
+ return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)),
+ 3);
+}
+
+void vpx_dc_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ dc_fill_predictor_8x8(dst, stride, dc_avg8(above, left));
+}
+
+void vpx_dc_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ dc_fill_predictor_16x16(dst, stride, dc_avg16(above, left));
+}
+
+static uint8x16_t dc_avg32(const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t a0 = vec_vsx_ld(0, above);
+ const uint8x16_t a1 = vec_vsx_ld(16, above);
+ const uint8x16_t l0 = vec_vsx_ld(0, left);
+ const uint8x16_t l1 = vec_vsx_ld(16, left);
+ const int32x4_t v32 = vec_sl(vec_splat_s32(1), vec_splat_u32(5));
+ const uint32x4_t a_sum = vec_sum4s(a0, vec_sum4s(a1, vec_splat_u32(0)));
+ const int32x4_t sum4s = (int32x4_t)vec_sum4s(l0, vec_sum4s(l1, a_sum));
+ const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s, v32);
+ const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(6));
+
+ return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)),
+ 3);
+}
+
+void vpx_dc_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ dc_fill_predictor_32x32(dst, stride, dc_avg32(above, left));
+}
+
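+// Rounded average of three values, (a + 2 * b + c + 2) >> 2:
+// (a & c) + ((a ^ c) >> 1) is floor((a + c) / 2) without overflow, and
+// vec_avg then adds b with rounding.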
+static uint8x16_t avg3(const uint8x16_t a, const uint8x16_t b,
+ const uint8x16_t c) {
+ const uint8x16_t ac =
+ vec_adds(vec_and(a, c), vec_sr(vec_xor(a, c), vec_splat_u8(1)));
+
+ return vec_avg(ac, b);
+}
+
+// Workaround for vec_sld/vec_xxsldi/vec_lsdoi being missing or broken.
+static const uint8x16_t sl1 = { 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8,
+ 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, 0x10 };
+
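+// Each D45 row is avg3 over three consecutive "above" pixels; permuting with
+// sl1 shifts the window one pixel further right for the next row.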
+void vpx_d45_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t af = vec_vsx_ld(0, above);
+ const uint8x16_t above_right = vec_splat(af, 7);
+ const uint8x16_t a = xxpermdi(af, above_right, 1);
+ const uint8x16_t b = vec_perm(a, above_right, sl1);
+ const uint8x16_t c = vec_perm(b, above_right, sl1);
+ uint8x16_t row = avg3(a, b, c);
+ int i;
+ (void)left;
+
+ for (i = 0; i < 8; i++) {
+ const uint8x16_t d = vec_vsx_ld(0, dst);
+ vec_vsx_st(xxpermdi(row, d, 1), 0, dst);
+ dst += stride;
+ row = vec_perm(row, above_right, sl1);
+ }
+}
+
+void vpx_d45_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t a = vec_vsx_ld(0, above);
+ const uint8x16_t above_right = vec_splat(a, 15);
+ const uint8x16_t b = vec_perm(a, above_right, sl1);
+ const uint8x16_t c = vec_perm(b, above_right, sl1);
+ uint8x16_t row = avg3(a, b, c);
+ int i;
+ (void)left;
+
+ for (i = 0; i < 16; i++) {
+ vec_vsx_st(row, 0, dst);
+ dst += stride;
+ row = vec_perm(row, above_right, sl1);
+ }
+}
+
+void vpx_d45_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t a0 = vec_vsx_ld(0, above);
+ const uint8x16_t a1 = vec_vsx_ld(16, above);
+ const uint8x16_t above_right = vec_splat(a1, 15);
+ const uint8x16_t b0 = vec_perm(a0, a1, sl1);
+ const uint8x16_t b1 = vec_perm(a1, above_right, sl1);
+ const uint8x16_t c0 = vec_perm(b0, b1, sl1);
+ const uint8x16_t c1 = vec_perm(b1, above_right, sl1);
+ uint8x16_t row0 = avg3(a0, b0, c0);
+ uint8x16_t row1 = avg3(a1, b1, c1);
+ int i;
+ (void)left;
+
+ for (i = 0; i < 32; i++) {
+ vec_vsx_st(row0, 0, dst);
+ vec_vsx_st(row1, 16, dst);
+ dst += stride;
+ row0 = vec_perm(row0, row1, sl1);
+ row1 = vec_perm(row1, above_right, sl1);
+ }
+}
+
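+// D63 alternates rows of 2-tap and 3-tap averages of the "above" row, with
+// both rows shifted one pixel right every second output row.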
+void vpx_d63_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t af = vec_vsx_ld(0, above);
+ const uint8x16_t above_right = vec_splat(af, 9);
+ const uint8x16_t a = xxpermdi(af, above_right, 1);
+ const uint8x16_t b = vec_perm(a, above_right, sl1);
+ const uint8x16_t c = vec_perm(b, above_right, sl1);
+ uint8x16_t row0 = vec_avg(a, b);
+ uint8x16_t row1 = avg3(a, b, c);
+ int i;
+ (void)left;
+
+ for (i = 0; i < 4; i++) {
+ const uint8x16_t d0 = vec_vsx_ld(0, dst);
+ const uint8x16_t d1 = vec_vsx_ld(0, dst + stride);
+ vec_vsx_st(xxpermdi(row0, d0, 1), 0, dst);
+ vec_vsx_st(xxpermdi(row1, d1, 1), 0, dst + stride);
+ dst += stride * 2;
+ row0 = vec_perm(row0, above_right, sl1);
+ row1 = vec_perm(row1, above_right, sl1);
+ }
+}
+
+void vpx_d63_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t a0 = vec_vsx_ld(0, above);
+ const uint8x16_t a1 = vec_vsx_ld(16, above);
+ const uint8x16_t above_right = vec_splat(a1, 0);
+ const uint8x16_t b = vec_perm(a0, above_right, sl1);
+ const uint8x16_t c = vec_perm(b, above_right, sl1);
+ uint8x16_t row0 = vec_avg(a0, b);
+ uint8x16_t row1 = avg3(a0, b, c);
+ int i;
+ (void)left;
+
+ for (i = 0; i < 8; i++) {
+ vec_vsx_st(row0, 0, dst);
+ vec_vsx_st(row1, 0, dst + stride);
+ dst += stride * 2;
+ row0 = vec_perm(row0, above_right, sl1);
+ row1 = vec_perm(row1, above_right, sl1);
+ }
+}
+
+void vpx_d63_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t a0 = vec_vsx_ld(0, above);
+ const uint8x16_t a1 = vec_vsx_ld(16, above);
+ const uint8x16_t a2 = vec_vsx_ld(32, above);
+ const uint8x16_t above_right = vec_splat(a2, 0);
+ const uint8x16_t b0 = vec_perm(a0, a1, sl1);
+ const uint8x16_t b1 = vec_perm(a1, above_right, sl1);
+ const uint8x16_t c0 = vec_perm(b0, b1, sl1);
+ const uint8x16_t c1 = vec_perm(b1, above_right, sl1);
+ uint8x16_t row0_0 = vec_avg(a0, b0);
+ uint8x16_t row0_1 = vec_avg(a1, b1);
+ uint8x16_t row1_0 = avg3(a0, b0, c0);
+ uint8x16_t row1_1 = avg3(a1, b1, c1);
+ int i;
+ (void)left;
+
+ for (i = 0; i < 16; i++) {
+ vec_vsx_st(row0_0, 0, dst);
+ vec_vsx_st(row0_1, 16, dst);
+ vec_vsx_st(row1_0, 0, dst + stride);
+ vec_vsx_st(row1_1, 16, dst + stride);
+ dst += stride * 2;
+ row0_0 = vec_perm(row0_0, row0_1, sl1);
+ row0_1 = vec_perm(row0_1, above_right, sl1);
+ row1_0 = vec_perm(row1_0, row1_1, sl1);
+ row1_1 = vec_perm(row1_1, above_right, sl1);
+ }
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/ppc/sad_vsx.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/ppc/sad_vsx.c
new file mode 100644
index 00000000000..3edb40c3158
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/ppc/sad_vsx.c
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "vpx_dsp/ppc/types_vsx.h"
+
+#include "vpx/vpx_integer.h"
+
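+// Accumulate the absolute differences of 16 pixel pairs at the given offset
+// into v_sad: widen both rows to 16 bits, subtract, take absolute values and
+// sum them into the four 32-bit accumulator lanes.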
+#define PROCESS16(offset) \
+ v_a = vec_vsx_ld(offset, a); \
+ v_b = vec_vsx_ld(offset, b); \
+ v_ah = unpack_to_s16_h(v_a); \
+ v_al = unpack_to_s16_l(v_a); \
+ v_bh = unpack_to_s16_h(v_b); \
+ v_bl = unpack_to_s16_l(v_b); \
+ v_subh = vec_sub(v_ah, v_bh); \
+ v_subl = vec_sub(v_al, v_bl); \
+ v_absh = vec_abs(v_subh); \
+ v_absl = vec_abs(v_subl); \
+ v_sad = vec_sum4s(v_absh, v_sad); \
+ v_sad = vec_sum4s(v_absl, v_sad);
+
+#define SAD16(height) \
+ unsigned int vpx_sad16x##height##_vsx(const uint8_t *a, int a_stride, \
+ const uint8_t *b, int b_stride) { \
+ int y; \
+ unsigned int sad[4]; \
+ uint8x16_t v_a, v_b; \
+ int16x8_t v_ah, v_al, v_bh, v_bl, v_absh, v_absl, v_subh, v_subl; \
+ int32x4_t v_sad = vec_splat_s32(0); \
+ \
+ for (y = 0; y < height; y++) { \
+ PROCESS16(0); \
+ \
+ a += a_stride; \
+ b += b_stride; \
+ } \
+ vec_vsx_st((uint32x4_t)v_sad, 0, sad); \
+ \
+ return sad[3] + sad[2] + sad[1] + sad[0]; \
+ }
+
+#define SAD32(height) \
+ unsigned int vpx_sad32x##height##_vsx(const uint8_t *a, int a_stride, \
+ const uint8_t *b, int b_stride) { \
+ int y; \
+ unsigned int sad[4]; \
+ uint8x16_t v_a, v_b; \
+ int16x8_t v_ah, v_al, v_bh, v_bl, v_absh, v_absl, v_subh, v_subl; \
+ int32x4_t v_sad = vec_splat_s32(0); \
+ \
+ for (y = 0; y < height; y++) { \
+ PROCESS16(0); \
+ PROCESS16(16); \
+ \
+ a += a_stride; \
+ b += b_stride; \
+ } \
+ vec_vsx_st((uint32x4_t)v_sad, 0, sad); \
+ \
+ return sad[3] + sad[2] + sad[1] + sad[0]; \
+ }
+
+#define SAD64(height) \
+ unsigned int vpx_sad64x##height##_vsx(const uint8_t *a, int a_stride, \
+ const uint8_t *b, int b_stride) { \
+ int y; \
+ unsigned int sad[4]; \
+ uint8x16_t v_a, v_b; \
+ int16x8_t v_ah, v_al, v_bh, v_bl, v_absh, v_absl, v_subh, v_subl; \
+ int32x4_t v_sad = vec_splat_s32(0); \
+ \
+ for (y = 0; y < height; y++) { \
+ PROCESS16(0); \
+ PROCESS16(16); \
+ PROCESS16(32); \
+ PROCESS16(48); \
+ \
+ a += a_stride; \
+ b += b_stride; \
+ } \
+ vec_vsx_st((uint32x4_t)v_sad, 0, sad); \
+ \
+ return sad[3] + sad[2] + sad[1] + sad[0]; \
+ }
+
+SAD16(8);
+SAD16(16);
+SAD16(32);
+SAD32(16);
+SAD32(32);
+SAD32(64);
+SAD64(32);
+SAD64(64);
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/ppc/types_vsx.h b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/ppc/types_vsx.h
index 2f3aa20495f..f611d02d2d5 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/ppc/types_vsx.h
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/ppc/types_vsx.h
@@ -13,8 +13,56 @@
#include <altivec.h>
+typedef vector signed char int8x16_t;
+typedef vector unsigned char uint8x16_t;
typedef vector signed short int16x8_t;
typedef vector unsigned short uint16x8_t;
typedef vector signed int int32x4_t;
+typedef vector unsigned int uint32x4_t;
+
+#ifdef __clang__
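+// clang (as of this import) does not provide the vec_xxpermdi() builtin, so
+// emulate the four doubleword selections of xxpermdi with vec_perm() control
+// vectors.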
+static const uint8x16_t xxpermdi0_perm = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
+ 0x06, 0x07, 0x10, 0x11, 0x12, 0x13,
+ 0x14, 0x15, 0x16, 0x17 };
+static const uint8x16_t xxpermdi1_perm = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
+ 0x06, 0x07, 0x18, 0x19, 0x1A, 0x1B,
+ 0x1C, 0x1D, 0x1E, 0x1F };
+static const uint8x16_t xxpermdi2_perm = { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D,
+ 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13,
+ 0x14, 0x15, 0x16, 0x17 };
+static const uint8x16_t xxpermdi3_perm = { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D,
+ 0x0E, 0x0F, 0x18, 0x19, 0x1A, 0x1B,
+ 0x1C, 0x1D, 0x1E, 0x1F };
+#define xxpermdi(a, b, c) vec_perm(a, b, xxpermdi##c##_perm)
+#elif defined(__GNUC__) && \
+ (__GNUC__ > 6 || (__GNUC__ == 6 && __GNUC_MINOR__ >= 3))
+#define xxpermdi(a, b, c) vec_xxpermdi(a, b, c)
+#endif
+
+#ifdef WORDS_BIGENDIAN
+#define unpack_to_u16_h(v) \
+ (uint16x8_t) vec_mergeh(vec_splat_u8(0), (uint8x16_t)v)
+#define unpack_to_u16_l(v) \
+ (uint16x8_t) vec_mergel(vec_splat_u8(0), (uint8x16_t)v)
+#define unpack_to_s16_h(v) \
+ (int16x8_t) vec_mergeh(vec_splat_u8(0), (uint8x16_t)v)
+#define unpack_to_s16_l(v) \
+ (int16x8_t) vec_mergel(vec_splat_u8(0), (uint8x16_t)v)
+#ifndef xxpermdi
+#define xxpermdi(a, b, c) vec_xxpermdi(a, b, c)
+#endif
+#else
+#define unpack_to_u16_h(v) \
+ (uint16x8_t) vec_mergeh((uint8x16_t)v, vec_splat_u8(0))
+#define unpack_to_u16_l(v) \
+ (uint16x8_t) vec_mergel((uint8x16_t)v, vec_splat_u8(0))
+#define unpack_to_s16_h(v) \
+ (int16x8_t) vec_mergeh((uint8x16_t)v, vec_splat_u8(0))
+#define unpack_to_s16_l(v) \
+ (int16x8_t) vec_mergel((uint8x16_t)v, vec_splat_u8(0))
+#ifndef xxpermdi
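+// Little endian reverses both the operand order and the doubleword numbering,
+// so swap the inputs and remap the two selector bits (swap them, then invert).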
+#define xxpermdi(a, b, c) vec_xxpermdi(b, a, ((c >> 1) | (c & 1) << 1) ^ 3)
+#endif
+#endif
#endif // VPX_DSP_PPC_TYPES_VSX_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/ppc/variance_vsx.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/ppc/variance_vsx.c
new file mode 100644
index 00000000000..1efe2f00569
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/ppc/variance_vsx.c
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/ppc/types_vsx.h"
+
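+// Load two 4-pixel rows and interleave their first words so that all 8 pixels
+// land in the low half of the result.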
+static inline uint8x16_t read4x2(const uint8_t *a, int stride) {
+ const uint32x4_t a0 = (uint32x4_t)vec_vsx_ld(0, a);
+ const uint32x4_t a1 = (uint32x4_t)vec_vsx_ld(0, a + stride);
+
+ return (uint8x16_t)vec_mergeh(a0, a1);
+}
+
+uint32_t vpx_get4x4sse_cs_vsx(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride) {
+ int distortion;
+
+ const int16x8_t a0 = unpack_to_s16_h(read4x2(a, a_stride));
+ const int16x8_t a1 = unpack_to_s16_h(read4x2(a + a_stride * 2, a_stride));
+ const int16x8_t b0 = unpack_to_s16_h(read4x2(b, b_stride));
+ const int16x8_t b1 = unpack_to_s16_h(read4x2(b + b_stride * 2, b_stride));
+ const int16x8_t d0 = vec_sub(a0, b0);
+ const int16x8_t d1 = vec_sub(a1, b1);
+ const int32x4_t ds = vec_msum(d1, d1, vec_msum(d0, d0, vec_splat_s32(0)));
+ const int32x4_t d = vec_splat(vec_sums(ds, vec_splat_s32(0)), 3);
+
+ vec_ste(d, 0, &distortion);
+
+ return distortion;
+}
+
+// TODO(lu_zero): Unroll
+uint32_t vpx_get_mb_ss_vsx(const int16_t *a) {
+ unsigned int i, sum = 0;
+ int32x4_t s = vec_splat_s32(0);
+
+ for (i = 0; i < 256; i += 8) {
+ const int16x8_t v = vec_vsx_ld(0, a + i);
+ s = vec_msum(v, v, s);
+ }
+
+ s = vec_splat(vec_sums(s, vec_splat_s32(0)), 3);
+
+ vec_ste((uint32x4_t)s, 0, &sum);
+
+ return sum;
+}
+
+void vpx_comp_avg_pred_vsx(uint8_t *comp_pred, const uint8_t *pred, int width,
+ int height, const uint8_t *ref, int ref_stride) {
+ int i, j;
+  /* comp_pred and pred must be 16-byte aligned. */
+ assert(((intptr_t)comp_pred & 0xf) == 0);
+ assert(((intptr_t)pred & 0xf) == 0);
+ if (width >= 16) {
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; j += 16) {
+ const uint8x16_t v = vec_avg(vec_vsx_ld(j, pred), vec_vsx_ld(j, ref));
+ vec_vsx_st(v, j, comp_pred);
+ }
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ }
+ } else if (width == 8) {
+    // Process 2 lines at a time.
+ for (i = 0; i < height / 2; ++i) {
+ const uint8x16_t r0 = vec_vsx_ld(0, ref);
+ const uint8x16_t r1 = vec_vsx_ld(0, ref + ref_stride);
+ const uint8x16_t r = xxpermdi(r0, r1, 0);
+ const uint8x16_t v = vec_avg(vec_vsx_ld(0, pred), r);
+ vec_vsx_st(v, 0, comp_pred);
+ comp_pred += 16; // width * 2;
+ pred += 16; // width * 2;
+ ref += ref_stride * 2;
+ }
+ } else {
+ assert(width == 4);
+    // Process 4 lines at a time.
+ for (i = 0; i < height / 4; ++i) {
+ const uint32x4_t r0 = (uint32x4_t)vec_vsx_ld(0, ref);
+ const uint32x4_t r1 = (uint32x4_t)vec_vsx_ld(0, ref + ref_stride);
+ const uint32x4_t r2 = (uint32x4_t)vec_vsx_ld(0, ref + ref_stride * 2);
+ const uint32x4_t r3 = (uint32x4_t)vec_vsx_ld(0, ref + ref_stride * 3);
+ const uint8x16_t r =
+ (uint8x16_t)xxpermdi(vec_mergeh(r0, r1), vec_mergeh(r2, r3), 0);
+ const uint8x16_t v = vec_avg(vec_vsx_ld(0, pred), r);
+ vec_vsx_st(v, 0, comp_pred);
+ comp_pred += 16; // width * 4;
+ pred += 16; // width * 4;
+ ref += ref_stride * 4;
+ }
+ }
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/ppc/vpx_convolve_vsx.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/ppc/vpx_convolve_vsx.c
new file mode 100644
index 00000000000..55dcdc2baf4
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/ppc/vpx_convolve_vsx.c
@@ -0,0 +1,418 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include <assert.h>
+#include <string.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/vpx_filter.h"
+#include "vpx_dsp/ppc/types_vsx.h"
+
+// TODO(lu_zero): unroll
+static inline void copy_w16(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride, int32_t h) {
+ int i;
+
+ for (i = h; i--;) {
+ vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static inline void copy_w32(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride, int32_t h) {
+ int i;
+
+ for (i = h; i--;) {
+ vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
+ vec_vsx_st(vec_vsx_ld(16, src), 16, dst);
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static inline void copy_w64(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride, int32_t h) {
+ int i;
+
+ for (i = h; i--;) {
+ vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
+ vec_vsx_st(vec_vsx_ld(16, src), 16, dst);
+ vec_vsx_st(vec_vsx_ld(32, src), 32, dst);
+ vec_vsx_st(vec_vsx_ld(48, src), 48, dst);
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+void vpx_convolve_copy_vsx(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int32_t filter_x_stride,
+ const int16_t *filter_y, int32_t filter_y_stride,
+ int32_t w, int32_t h) {
+ (void)filter_x;
+ (void)filter_y;
+ (void)filter_x_stride;
+ (void)filter_y_stride;
+
+ switch (w) {
+ case 16: {
+ copy_w16(src, src_stride, dst, dst_stride, h);
+ break;
+ }
+ case 32: {
+ copy_w32(src, src_stride, dst, dst_stride, h);
+ break;
+ }
+ case 64: {
+ copy_w64(src, src_stride, dst, dst_stride, h);
+ break;
+ }
+ default: {
+ int i;
+ for (i = h; i--;) {
+ memcpy(dst, src, w);
+ src += src_stride;
+ dst += dst_stride;
+ }
+ break;
+ }
+ }
+}
+
+static inline void avg_w16(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride, int32_t h) {
+ int i;
+
+ for (i = h; i--;) {
+ const uint8x16_t v = vec_avg(vec_vsx_ld(0, src), vec_vsx_ld(0, dst));
+ vec_vsx_st(v, 0, dst);
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static inline void avg_w32(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride, int32_t h) {
+ int i;
+
+ for (i = h; i--;) {
+ const uint8x16_t v0 = vec_avg(vec_vsx_ld(0, src), vec_vsx_ld(0, dst));
+ const uint8x16_t v1 = vec_avg(vec_vsx_ld(16, src), vec_vsx_ld(16, dst));
+ vec_vsx_st(v0, 0, dst);
+ vec_vsx_st(v1, 16, dst);
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static inline void avg_w64(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride, int32_t h) {
+ int i;
+
+ for (i = h; i--;) {
+ const uint8x16_t v0 = vec_avg(vec_vsx_ld(0, src), vec_vsx_ld(0, dst));
+ const uint8x16_t v1 = vec_avg(vec_vsx_ld(16, src), vec_vsx_ld(16, dst));
+ const uint8x16_t v2 = vec_avg(vec_vsx_ld(32, src), vec_vsx_ld(32, dst));
+ const uint8x16_t v3 = vec_avg(vec_vsx_ld(48, src), vec_vsx_ld(48, dst));
+ vec_vsx_st(v0, 0, dst);
+ vec_vsx_st(v1, 16, dst);
+ vec_vsx_st(v2, 32, dst);
+ vec_vsx_st(v3, 48, dst);
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+void vpx_convolve_avg_vsx(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int32_t filter_x_stride,
+ const int16_t *filter_y, int32_t filter_y_stride,
+ int32_t w, int32_t h) {
+ (void)filter_x;
+ (void)filter_y;
+ (void)filter_x_stride;
+ (void)filter_y_stride;
+
+ switch (w) {
+ case 16: {
+ avg_w16(src, src_stride, dst, dst_stride, h);
+ break;
+ }
+ case 32: {
+ avg_w32(src, src_stride, dst, dst_stride, h);
+ break;
+ }
+ case 64: {
+ avg_w64(src, src_stride, dst, dst_stride, h);
+ break;
+ }
+ default: {
+ vpx_convolve_avg_c(src, src_stride, dst, dst_stride, filter_x,
+ filter_x_stride, filter_y, filter_y_stride, w, h);
+ break;
+ }
+ }
+}
+
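+// Apply an 8-tap filter to one output pixel: multiply-sum the taps, add the
+// rounding bias, shift by FILTER_BITS, then pack with unsigned saturation and
+// store the single result byte.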
+static inline void convolve_line(uint8_t *dst, const int16x8_t s,
+ const int16x8_t f) {
+ const int32x4_t sum = vec_msum(s, f, vec_splat_s32(0));
+ const int32x4_t bias =
+ vec_sl(vec_splat_s32(1), vec_splat_u32(FILTER_BITS - 1));
+ const int32x4_t avg = vec_sr(vec_sums(sum, bias), vec_splat_u32(FILTER_BITS));
+ const uint8x16_t v = vec_splat(
+ vec_packsu(vec_pack(avg, vec_splat_s32(0)), vec_splat_s16(0)), 3);
+ vec_ste(v, 0, dst);
+}
+
+static inline void convolve_line_h(uint8_t *dst, const uint8_t *const src_x,
+ const int16_t *const x_filter) {
+ const int16x8_t s = unpack_to_s16_h(vec_vsx_ld(0, src_x));
+ const int16x8_t f = vec_vsx_ld(0, x_filter);
+
+ convolve_line(dst, s, f);
+}
+
+// TODO(lu_zero): Implement 8x8 and bigger block special cases
+static inline void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *x_filters, int x0_q4,
+ int x_step_q4, int w, int h) {
+ int x, y;
+ src -= SUBPEL_TAPS / 2 - 1;
+
+ for (y = 0; y < h; ++y) {
+ int x_q4 = x0_q4;
+ for (x = 0; x < w; ++x) {
+ convolve_line_h(dst + x, &src[x_q4 >> SUBPEL_BITS],
+ x_filters[x_q4 & SUBPEL_MASK]);
+ x_q4 += x_step_q4;
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static inline void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *x_filters, int x0_q4,
+ int x_step_q4, int w, int h) {
+ int x, y;
+ src -= SUBPEL_TAPS / 2 - 1;
+
+ for (y = 0; y < h; ++y) {
+ int x_q4 = x0_q4;
+ for (x = 0; x < w; ++x) {
+ uint8_t v;
+ convolve_line_h(&v, &src[x_q4 >> SUBPEL_BITS],
+ x_filters[x_q4 & SUBPEL_MASK]);
+ dst[x] = ROUND_POWER_OF_TWO(dst[x] + v, 1);
+ x_q4 += x_step_q4;
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
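+// Gather byte 0 of each of the eight inputs into the low half of the result
+// (one column of an 8x8 byte transpose).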
+static uint8x16_t transpose_line_u8_8x8(uint8x16_t a, uint8x16_t b,
+ uint8x16_t c, uint8x16_t d,
+ uint8x16_t e, uint8x16_t f,
+ uint8x16_t g, uint8x16_t h) {
+ uint16x8_t ab = (uint16x8_t)vec_mergeh(a, b);
+ uint16x8_t cd = (uint16x8_t)vec_mergeh(c, d);
+ uint16x8_t ef = (uint16x8_t)vec_mergeh(e, f);
+ uint16x8_t gh = (uint16x8_t)vec_mergeh(g, h);
+
+ uint32x4_t abcd = (uint32x4_t)vec_mergeh(ab, cd);
+ uint32x4_t efgh = (uint32x4_t)vec_mergeh(ef, gh);
+
+ return (uint8x16_t)vec_mergeh(abcd, efgh);
+}
+
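+// Filter one output pixel vertically: fetch one byte from each of the 8
+// source rows, transpose them into a single vector and reuse the line filter.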
+static inline void convolve_line_v(uint8_t *dst, const uint8_t *const src_y,
+ ptrdiff_t src_stride,
+ const int16_t *const y_filter) {
+ uint8x16_t s0 = vec_vsx_ld(0, src_y + 0 * src_stride);
+ uint8x16_t s1 = vec_vsx_ld(0, src_y + 1 * src_stride);
+ uint8x16_t s2 = vec_vsx_ld(0, src_y + 2 * src_stride);
+ uint8x16_t s3 = vec_vsx_ld(0, src_y + 3 * src_stride);
+ uint8x16_t s4 = vec_vsx_ld(0, src_y + 4 * src_stride);
+ uint8x16_t s5 = vec_vsx_ld(0, src_y + 5 * src_stride);
+ uint8x16_t s6 = vec_vsx_ld(0, src_y + 6 * src_stride);
+ uint8x16_t s7 = vec_vsx_ld(0, src_y + 7 * src_stride);
+ const int16x8_t f = vec_vsx_ld(0, y_filter);
+  const uint8x16_t s = transpose_line_u8_8x8(s0, s1, s2, s3, s4, s5, s6, s7);
+
+ convolve_line(dst, unpack_to_s16_h(s), f);
+}
+
+static inline void convolve_vert(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *y_filters, int y0_q4,
+ int y_step_q4, int w, int h) {
+ int x, y;
+ src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+
+ for (x = 0; x < w; ++x) {
+ int y_q4 = y0_q4;
+ for (y = 0; y < h; ++y) {
+ convolve_line_v(dst + y * dst_stride,
+ &src[(y_q4 >> SUBPEL_BITS) * src_stride], src_stride,
+ y_filters[y_q4 & SUBPEL_MASK]);
+ y_q4 += y_step_q4;
+ }
+ ++src;
+ ++dst;
+ }
+}
+
+static inline void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *y_filters, int y0_q4,
+ int y_step_q4, int w, int h) {
+ int x, y;
+ src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+
+ for (x = 0; x < w; ++x) {
+ int y_q4 = y0_q4;
+ for (y = 0; y < h; ++y) {
+ uint8_t v;
+ convolve_line_v(&v, &src[(y_q4 >> SUBPEL_BITS) * src_stride], src_stride,
+ y_filters[y_q4 & SUBPEL_MASK]);
+ dst[y * dst_stride] = ROUND_POWER_OF_TWO(dst[y * dst_stride] + v, 1);
+ y_q4 += y_step_q4;
+ }
+ ++src;
+ ++dst;
+ }
+}
+
+static inline void convolve(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *const x_filters, int x0_q4,
+ int x_step_q4, const InterpKernel *const y_filters,
+ int y0_q4, int y_step_q4, int w, int h) {
+ // Note: Fixed size intermediate buffer, temp, places limits on parameters.
+ // 2d filtering proceeds in 2 steps:
+ // (1) Interpolate horizontally into an intermediate buffer, temp.
+ // (2) Interpolate temp vertically to derive the sub-pixel result.
+ // Deriving the maximum number of rows in the temp buffer (135):
+ // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
+ // --Largest block size is 64x64 pixels.
+ // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
+ // original frame (in 1/16th pixel units).
+ // --Must round-up because block may be located at sub-pixel position.
+ // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
+ // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
+ DECLARE_ALIGNED(16, uint8_t, temp[64 * 135]);
+ const int intermediate_height =
+ (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+
+ assert(w <= 64);
+ assert(h <= 64);
+ assert(y_step_q4 <= 32);
+ assert(x_step_q4 <= 32);
+
+ convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64,
+ x_filters, x0_q4, x_step_q4, w, intermediate_height);
+ convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride,
+ y_filters, y0_q4, y_step_q4, w, h);
+}
+
+void vpx_convolve8_horiz_vsx(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
+ const InterpKernel *const filters_x = get_filter_base(filter_x);
+ const int x0_q4 = get_filter_offset(filter_x, filters_x);
+
+ (void)filter_y;
+ (void)y_step_q4;
+
+ convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
+ w, h);
+}
+
+void vpx_convolve8_avg_horiz_vsx(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
+ const InterpKernel *const filters_x = get_filter_base(filter_x);
+ const int x0_q4 = get_filter_offset(filter_x, filters_x);
+
+ (void)filter_y;
+ (void)y_step_q4;
+
+ convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
+ x_step_q4, w, h);
+}
+
+void vpx_convolve8_vert_vsx(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
+ const InterpKernel *const filters_y = get_filter_base(filter_y);
+ const int y0_q4 = get_filter_offset(filter_y, filters_y);
+
+ (void)filter_x;
+ (void)x_step_q4;
+
+ convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, y_step_q4,
+ w, h);
+}
+
+void vpx_convolve8_avg_vert_vsx(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
+ const InterpKernel *const filters_y = get_filter_base(filter_y);
+ const int y0_q4 = get_filter_offset(filter_y, filters_y);
+
+ (void)filter_x;
+ (void)x_step_q4;
+
+ convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
+ y_step_q4, w, h);
+}
+
+void vpx_convolve8_vsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const int16_t *filter_x,
+ int x_step_q4, const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ const InterpKernel *const filters_x = get_filter_base(filter_x);
+ const int x0_q4 = get_filter_offset(filter_x, filters_x);
+ const InterpKernel *const filters_y = get_filter_base(filter_y);
+ const int y0_q4 = get_filter_offset(filter_y, filters_y);
+
+ convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
+ filters_y, y0_q4, y_step_q4, w, h);
+}
+
+void vpx_convolve8_avg_vsx(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
+ // Fixed size intermediate buffer places limits on parameters.
+ DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]);
+ assert(w <= 64);
+ assert(h <= 64);
+
+ vpx_convolve8_vsx(src, src_stride, temp, 64, filter_x, x_step_q4, filter_y,
+ y_step_q4, w, h);
+ vpx_convolve_avg_vsx(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h);
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/sad.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/sad.c
index c80ef729bff..6ceb37e430b 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/sad.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/sad.c
@@ -39,7 +39,7 @@ static INLINE unsigned int sad(const uint8_t *a, int a_stride, const uint8_t *b,
unsigned int vpx_sad##m##x##n##_avg_c(const uint8_t *src, int src_stride, \
const uint8_t *ref, int ref_stride, \
const uint8_t *second_pred) { \
- uint8_t comp_pred[m * n]; \
+ DECLARE_ALIGNED(16, uint8_t, comp_pred[m * n]); \
vpx_comp_avg_pred_c(comp_pred, second_pred, m, n, ref, ref_stride); \
return sad(src, src_stride, comp_pred, m, m, n); \
}
@@ -178,7 +178,7 @@ static INLINE unsigned int highbd_sadb(const uint8_t *a8, int a_stride,
unsigned int vpx_highbd_sad##m##x##n##_avg_c( \
const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
const uint8_t *second_pred) { \
- uint16_t comp_pred[m * n]; \
+ DECLARE_ALIGNED(16, uint16_t, comp_pred[m * n]); \
vpx_highbd_comp_avg_pred_c(comp_pred, second_pred, m, n, ref, ref_stride); \
return highbd_sadb(src, src_stride, comp_pred, m, m, n); \
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/variance.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/variance.c
index 4214150251f..b1744047af1 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/variance.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/variance.c
@@ -8,6 +8,8 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <assert.h>
+
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
@@ -224,6 +226,9 @@ MSE(8, 8)
void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
int height, const uint8_t *ref, int ref_stride) {
int i, j;
+  /* comp_pred and pred must be 16-byte aligned. */
+ assert(((intptr_t)comp_pred & 0xf) == 0);
+ assert(((intptr_t)pred & 0xf) == 0);
for (i = 0; i < height; ++i) {
for (j = 0; j < width; ++j) {
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_convolve.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_convolve.c
index cab6368e606..02c5a955a76 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_convolve.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_convolve.c
@@ -319,13 +319,11 @@ void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
}
#if CONFIG_VP9_HIGHBITDEPTH
-static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride,
- uint8_t *dst8, ptrdiff_t dst_stride,
+static void highbd_convolve_horiz(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride,
const InterpKernel *x_filters, int x0_q4,
int x_step_q4, int w, int h, int bd) {
int x, y;
- const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
src -= SUBPEL_TAPS / 2 - 1;
for (y = 0; y < h; ++y) {
@@ -343,13 +341,11 @@ static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride,
}
}
-static void highbd_convolve_avg_horiz(const uint8_t *src8, ptrdiff_t src_stride,
- uint8_t *dst8, ptrdiff_t dst_stride,
+static void highbd_convolve_avg_horiz(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride,
const InterpKernel *x_filters, int x0_q4,
int x_step_q4, int w, int h, int bd) {
int x, y;
- const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
src -= SUBPEL_TAPS / 2 - 1;
for (y = 0; y < h; ++y) {
@@ -369,13 +365,11 @@ static void highbd_convolve_avg_horiz(const uint8_t *src8, ptrdiff_t src_stride,
}
}
-static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride,
- uint8_t *dst8, ptrdiff_t dst_stride,
+static void highbd_convolve_vert(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride,
const InterpKernel *y_filters, int y0_q4,
int y_step_q4, int w, int h, int bd) {
int x, y;
- const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
src -= src_stride * (SUBPEL_TAPS / 2 - 1);
for (x = 0; x < w; ++x) {
@@ -395,13 +389,11 @@ static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride,
}
}
-static void highbd_convolve_avg_vert(const uint8_t *src8, ptrdiff_t src_stride,
- uint8_t *dst8, ptrdiff_t dst_stride,
+static void highbd_convolve_avg_vert(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride,
const InterpKernel *y_filters, int y0_q4,
int y_step_q4, int w, int h, int bd) {
int x, y;
- const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
src -= src_stride * (SUBPEL_TAPS / 2 - 1);
for (x = 0; x < w; ++x) {
@@ -423,8 +415,8 @@ static void highbd_convolve_avg_vert(const uint8_t *src8, ptrdiff_t src_stride,
}
}
-static void highbd_convolve(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
+static void highbd_convolve(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride,
const InterpKernel *const x_filters, int x0_q4,
int x_step_q4, const InterpKernel *const y_filters,
int y0_q4, int y_step_q4, int w, int h, int bd) {
@@ -450,15 +442,14 @@ static void highbd_convolve(const uint8_t *src, ptrdiff_t src_stride,
assert(x_step_q4 <= 32);
highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
- CONVERT_TO_BYTEPTR(temp), 64, x_filters, x0_q4,
- x_step_q4, w, intermediate_height, bd);
- highbd_convolve_vert(CONVERT_TO_BYTEPTR(temp) + 64 * (SUBPEL_TAPS / 2 - 1),
- 64, dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h,
- bd);
+ temp, 64, x_filters, x0_q4, x_step_q4, w,
+ intermediate_height, bd);
+ highbd_convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride,
+ y_filters, y0_q4, y_step_q4, w, h, bd);
}
-void vpx_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
+void vpx_highbd_convolve8_horiz_c(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4, int w,
int h, int bd) {
@@ -472,8 +463,8 @@ void vpx_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
x_step_q4, w, h, bd);
}
-void vpx_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
+void vpx_highbd_convolve8_avg_horiz_c(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h, int bd) {
@@ -487,8 +478,8 @@ void vpx_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
x_step_q4, w, h, bd);
}
-void vpx_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
+void vpx_highbd_convolve8_vert_c(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4, int w,
int h, int bd) {
@@ -502,8 +493,8 @@ void vpx_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
y_step_q4, w, h, bd);
}
-void vpx_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
+void vpx_highbd_convolve8_avg_vert_c(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h, int bd) {
@@ -517,8 +508,8 @@ void vpx_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
y_step_q4, w, h, bd);
}
-void vpx_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
+void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4, int w,
int h, int bd) {
@@ -531,8 +522,8 @@ void vpx_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
filters_y, y0_q4, y_step_q4, w, h, bd);
}
-void vpx_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
+void vpx_highbd_convolve8_avg_c(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4, int w,
int h, int bd) {
@@ -541,20 +532,18 @@ void vpx_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
assert(w <= 64);
assert(h <= 64);
- vpx_highbd_convolve8_c(src, src_stride, CONVERT_TO_BYTEPTR(temp), 64,
- filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd);
- vpx_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), 64, dst, dst_stride, NULL,
- 0, NULL, 0, w, h, bd);
+ vpx_highbd_convolve8_c(src, src_stride, temp, 64, filter_x, x_step_q4,
+ filter_y, y_step_q4, w, h, bd);
+ vpx_highbd_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h,
+ bd);
}
-void vpx_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride,
- uint8_t *dst8, ptrdiff_t dst_stride,
+void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int filter_x_stride,
const int16_t *filter_y, int filter_y_stride,
int w, int h, int bd) {
int r;
- const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
(void)filter_x;
(void)filter_x_stride;
@@ -569,14 +558,12 @@ void vpx_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride,
}
}
-void vpx_highbd_convolve_avg_c(const uint8_t *src8, ptrdiff_t src_stride,
- uint8_t *dst8, ptrdiff_t dst_stride,
+void vpx_highbd_convolve_avg_c(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int filter_x_stride,
const int16_t *filter_y, int filter_y_stride,
int w, int h, int bd) {
int x, y;
- const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
(void)filter_x;
(void)filter_x_stride;
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_convolve.h b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_convolve.h
index ee9744b3ae0..1aedd32bd4b 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_convolve.h
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_convolve.h
@@ -24,8 +24,8 @@ typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
int h);
#if CONFIG_VP9_HIGHBITDEPTH
-typedef void (*highbd_convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
+typedef void (*highbd_convolve_fn_t)(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h, int bd);
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp.mk b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp.mk
index 73c50fd3dd9..6ac7182abde 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp.mk
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp.mk
@@ -51,6 +51,7 @@ DSP_SRCS-$(HAVE_SSE) += x86/intrapred_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/intrapred_sse2.asm
DSP_SRCS-$(HAVE_SSSE3) += x86/intrapred_ssse3.asm
DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_8t_ssse3.asm
+DSP_SRCS-$(HAVE_VSX) += ppc/intrapred_vsx.c
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_SSE) += x86/highbd_intrapred_sse2.asm
@@ -95,6 +96,7 @@ DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_8t_intrin_ssse3.c
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/vpx_high_subpixel_8t_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/vpx_high_subpixel_bilinear_sse2.asm
+DSP_SRCS-$(HAVE_AVX2) += x86/highbd_convolve_avx2.c
DSP_SRCS-$(HAVE_NEON) += arm/highbd_vpx_convolve_copy_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/highbd_vpx_convolve_avg_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/highbd_vpx_convolve8_neon.c
@@ -142,6 +144,8 @@ DSP_SRCS-$(HAVE_DSPR2) += mips/convolve8_dspr2.c
DSP_SRCS-$(HAVE_DSPR2) += mips/convolve8_horiz_dspr2.c
DSP_SRCS-$(HAVE_DSPR2) += mips/convolve8_vert_dspr2.c
+DSP_SRCS-$(HAVE_VSX) += ppc/vpx_convolve_vsx.c
+
# loop filters
DSP_SRCS-yes += loopfilter.c
@@ -189,6 +193,7 @@ DSP_SRCS-$(HAVE_SSSE3) += x86/fwd_txfm_ssse3_x86_64.asm
endif
DSP_SRCS-$(HAVE_AVX2) += x86/fwd_txfm_avx2.c
DSP_SRCS-$(HAVE_AVX2) += x86/fwd_dct32x32_impl_avx2.h
+DSP_SRCS-$(HAVE_NEON) += arm/fdct_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/fwd_txfm_neon.c
DSP_SRCS-$(HAVE_MSA) += mips/fwd_txfm_msa.h
DSP_SRCS-$(HAVE_MSA) += mips/fwd_txfm_msa.c
@@ -227,6 +232,11 @@ DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct32x32_add_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct32x32_34_add_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct32x32_135_add_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct32x32_1024_add_neon.c
+DSP_SRCS-$(HAVE_SSE2) += x86/highbd_inv_txfm_sse2.h
+DSP_SRCS-$(HAVE_SSE2) += x86/highbd_idct4x4_add_sse2.c
+DSP_SRCS-$(HAVE_SSE2) += x86/highbd_idct8x8_add_sse2.c
+DSP_SRCS-$(HAVE_SSE2) += x86/highbd_idct16x16_add_sse2.c
+DSP_SRCS-$(HAVE_SSE2) += x86/highbd_idct32x32_add_sse2.c
endif # !CONFIG_VP9_HIGHBITDEPTH
ifeq ($(HAVE_NEON_ASM),yes)
@@ -302,6 +312,8 @@ DSP_SRCS-$(HAVE_SSE2) += x86/sad4d_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/sad_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/subtract_sse2.asm
+DSP_SRCS-$(HAVE_VSX) += ppc/sad_vsx.c
+
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm
@@ -320,9 +332,11 @@ DSP_SRCS-$(HAVE_MSA) += mips/variance_msa.c
DSP_SRCS-$(HAVE_MSA) += mips/sub_pixel_variance_msa.c
DSP_SRCS-$(HAVE_SSE) += x86/variance_sse2.c
+DSP_SRCS-$(HAVE_SSE2) += x86/avg_pred_sse2.c
DSP_SRCS-$(HAVE_SSE2) += x86/variance_sse2.c # Contains SSE2 and SSSE3
DSP_SRCS-$(HAVE_AVX2) += x86/variance_avx2.c
DSP_SRCS-$(HAVE_AVX2) += x86/variance_impl_avx2.c
+DSP_SRCS-$(HAVE_VSX) += ppc/variance_vsx.c
ifeq ($(ARCH_X86_64),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/ssim_opt_x86_64.asm
@@ -339,6 +353,7 @@ endif # CONFIG_VP9_HIGHBITDEPTH
endif # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC
# Neon utilities
+DSP_SRCS-$(HAVE_NEON) += arm/mem_neon.h
DSP_SRCS-$(HAVE_NEON) += arm/transpose_neon.h
# PPC VSX utilities
@@ -346,6 +361,9 @@ DSP_SRCS-$(HAVE_VSX) += ppc/types_vsx.h
DSP_SRCS-$(HAVE_VSX) += ppc/transpose_vsx.h
DSP_SRCS-$(HAVE_VSX) += ppc/bitdepth_conversion_vsx.h
+# X86 utilities
+DSP_SRCS-$(HAVE_SSE2) += x86/transpose_sse2.h
+
DSP_SRCS-no += $(DSP_SRCS_REMOVE-yes)
DSP_SRCS-yes += vpx_dsp_rtcd.c
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 5c2ba1cc541..410055077c5 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -39,7 +39,7 @@ specialize qw/vpx_d63_predictor_4x4 ssse3/;
add_proto qw/void vpx_d63e_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
add_proto qw/void vpx_h_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_h_predictor_4x4 neon dspr2 msa sse2/;
+specialize qw/vpx_h_predictor_4x4 neon dspr2 msa sse2 vsx/;
add_proto qw/void vpx_he_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
@@ -57,7 +57,7 @@ specialize qw/vpx_v_predictor_4x4 neon msa sse2/;
add_proto qw/void vpx_ve_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
add_proto qw/void vpx_tm_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_tm_predictor_4x4 neon dspr2 msa sse2/;
+specialize qw/vpx_tm_predictor_4x4 neon dspr2 msa sse2 vsx/;
add_proto qw/void vpx_dc_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vpx_dc_predictor_4x4 dspr2 msa neon sse2/;
@@ -75,13 +75,13 @@ add_proto qw/void vpx_d207_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, co
specialize qw/vpx_d207_predictor_8x8 ssse3/;
add_proto qw/void vpx_d45_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_d45_predictor_8x8 neon sse2/;
+specialize qw/vpx_d45_predictor_8x8 neon sse2 vsx/;
add_proto qw/void vpx_d63_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_d63_predictor_8x8 ssse3/;
+specialize qw/vpx_d63_predictor_8x8 ssse3 vsx/;
add_proto qw/void vpx_h_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_h_predictor_8x8 neon dspr2 msa sse2/;
+specialize qw/vpx_h_predictor_8x8 neon dspr2 msa sse2 vsx/;
add_proto qw/void vpx_d117_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
@@ -95,10 +95,10 @@ add_proto qw/void vpx_v_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const
specialize qw/vpx_v_predictor_8x8 neon msa sse2/;
add_proto qw/void vpx_tm_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_tm_predictor_8x8 neon dspr2 msa sse2/;
+specialize qw/vpx_tm_predictor_8x8 neon dspr2 msa sse2 vsx/;
add_proto qw/void vpx_dc_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_dc_predictor_8x8 dspr2 neon msa sse2/;
+specialize qw/vpx_dc_predictor_8x8 dspr2 neon msa sse2 vsx/;
add_proto qw/void vpx_dc_top_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vpx_dc_top_predictor_8x8 neon msa sse2/;
@@ -113,13 +113,13 @@ add_proto qw/void vpx_d207_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride,
specialize qw/vpx_d207_predictor_16x16 ssse3/;
add_proto qw/void vpx_d45_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_d45_predictor_16x16 neon ssse3/;
+specialize qw/vpx_d45_predictor_16x16 neon ssse3 vsx/;
add_proto qw/void vpx_d63_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_d63_predictor_16x16 ssse3/;
+specialize qw/vpx_d63_predictor_16x16 ssse3 vsx/;
add_proto qw/void vpx_h_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_h_predictor_16x16 neon dspr2 msa sse2/;
+specialize qw/vpx_h_predictor_16x16 neon dspr2 msa sse2 vsx/;
add_proto qw/void vpx_d117_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
@@ -130,34 +130,34 @@ add_proto qw/void vpx_d153_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride,
specialize qw/vpx_d153_predictor_16x16 ssse3/;
add_proto qw/void vpx_v_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_v_predictor_16x16 neon msa sse2/;
+specialize qw/vpx_v_predictor_16x16 neon msa sse2 vsx/;
add_proto qw/void vpx_tm_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_tm_predictor_16x16 neon msa sse2/;
+specialize qw/vpx_tm_predictor_16x16 neon msa sse2 vsx/;
add_proto qw/void vpx_dc_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_dc_predictor_16x16 dspr2 neon msa sse2/;
+specialize qw/vpx_dc_predictor_16x16 dspr2 neon msa sse2 vsx/;
add_proto qw/void vpx_dc_top_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_dc_top_predictor_16x16 neon msa sse2/;
+specialize qw/vpx_dc_top_predictor_16x16 neon msa sse2 vsx/;
add_proto qw/void vpx_dc_left_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_dc_left_predictor_16x16 neon msa sse2/;
+specialize qw/vpx_dc_left_predictor_16x16 neon msa sse2 vsx/;
add_proto qw/void vpx_dc_128_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_dc_128_predictor_16x16 neon msa sse2/;
+specialize qw/vpx_dc_128_predictor_16x16 neon msa sse2 vsx/;
add_proto qw/void vpx_d207_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vpx_d207_predictor_32x32 ssse3/;
add_proto qw/void vpx_d45_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_d45_predictor_32x32 neon ssse3/;
+specialize qw/vpx_d45_predictor_32x32 neon ssse3 vsx/;
add_proto qw/void vpx_d63_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_d63_predictor_32x32 ssse3/;
+specialize qw/vpx_d63_predictor_32x32 ssse3 vsx/;
add_proto qw/void vpx_h_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_h_predictor_32x32 neon msa sse2/;
+specialize qw/vpx_h_predictor_32x32 neon msa sse2 vsx/;
add_proto qw/void vpx_d117_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
@@ -168,22 +168,22 @@ add_proto qw/void vpx_d153_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride,
specialize qw/vpx_d153_predictor_32x32 ssse3/;
add_proto qw/void vpx_v_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_v_predictor_32x32 neon msa sse2/;
+specialize qw/vpx_v_predictor_32x32 neon msa sse2 vsx/;
add_proto qw/void vpx_tm_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_tm_predictor_32x32 neon msa sse2/;
+specialize qw/vpx_tm_predictor_32x32 neon msa sse2 vsx/;
add_proto qw/void vpx_dc_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_dc_predictor_32x32 msa neon sse2/;
+specialize qw/vpx_dc_predictor_32x32 msa neon sse2 vsx/;
add_proto qw/void vpx_dc_top_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_dc_top_predictor_32x32 msa neon sse2/;
+specialize qw/vpx_dc_top_predictor_32x32 msa neon sse2 vsx/;
add_proto qw/void vpx_dc_left_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_dc_left_predictor_32x32 msa neon sse2/;
+specialize qw/vpx_dc_left_predictor_32x32 msa neon sse2 vsx/;
add_proto qw/void vpx_dc_128_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_dc_128_predictor_32x32 msa neon sse2/;
+specialize qw/vpx_dc_128_predictor_32x32 msa neon sse2 vsx/;
# High bitdepth functions
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
@@ -332,28 +332,28 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
# Sub Pixel Filters
#
add_proto qw/void vpx_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve_copy neon dspr2 msa sse2/;
+specialize qw/vpx_convolve_copy neon dspr2 msa sse2 vsx/;
add_proto qw/void vpx_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve_avg neon dspr2 msa sse2/;
+specialize qw/vpx_convolve_avg neon dspr2 msa sse2 vsx/;
add_proto qw/void vpx_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8 sse2 ssse3 avx2 neon dspr2 msa/;
+specialize qw/vpx_convolve8 sse2 ssse3 avx2 neon dspr2 msa vsx/;
add_proto qw/void vpx_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8_horiz sse2 ssse3 avx2 neon dspr2 msa/;
+specialize qw/vpx_convolve8_horiz sse2 ssse3 avx2 neon dspr2 msa vsx/;
add_proto qw/void vpx_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8_vert sse2 ssse3 avx2 neon dspr2 msa/;
+specialize qw/vpx_convolve8_vert sse2 ssse3 avx2 neon dspr2 msa vsx/;
add_proto qw/void vpx_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8_avg sse2 ssse3 neon dspr2 msa/;
+specialize qw/vpx_convolve8_avg sse2 ssse3 neon dspr2 msa vsx/;
add_proto qw/void vpx_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8_avg_horiz sse2 ssse3 neon dspr2 msa/;
+specialize qw/vpx_convolve8_avg_horiz sse2 ssse3 neon dspr2 msa vsx/;
add_proto qw/void vpx_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8_avg_vert sse2 ssse3 neon dspr2 msa/;
+specialize qw/vpx_convolve8_avg_vert sse2 ssse3 neon dspr2 msa vsx/;
add_proto qw/void vpx_scaled_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
specialize qw/vpx_scaled_2d ssse3/;
@@ -372,29 +372,29 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
#
# Sub Pixel Filters
#
- add_proto qw/void vpx_highbd_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
- specialize qw/vpx_highbd_convolve_copy sse2 neon/;
+ add_proto qw/void vpx_highbd_convolve_copy/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+ specialize qw/vpx_highbd_convolve_copy sse2 avx2 neon/;
- add_proto qw/void vpx_highbd_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
- specialize qw/vpx_highbd_convolve_avg sse2 neon/;
+ add_proto qw/void vpx_highbd_convolve_avg/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+ specialize qw/vpx_highbd_convolve_avg sse2 avx2 neon/;
- add_proto qw/void vpx_highbd_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
- specialize qw/vpx_highbd_convolve8 neon/, "$sse2_x86_64";
+ add_proto qw/void vpx_highbd_convolve8/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+ specialize qw/vpx_highbd_convolve8 avx2 neon/, "$sse2_x86_64";
- add_proto qw/void vpx_highbd_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
- specialize qw/vpx_highbd_convolve8_horiz neon/, "$sse2_x86_64";
+ add_proto qw/void vpx_highbd_convolve8_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+ specialize qw/vpx_highbd_convolve8_horiz avx2 neon/, "$sse2_x86_64";
- add_proto qw/void vpx_highbd_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
- specialize qw/vpx_highbd_convolve8_vert neon/, "$sse2_x86_64";
+ add_proto qw/void vpx_highbd_convolve8_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+ specialize qw/vpx_highbd_convolve8_vert avx2 neon/, "$sse2_x86_64";
- add_proto qw/void vpx_highbd_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
- specialize qw/vpx_highbd_convolve8_avg neon/, "$sse2_x86_64";
+ add_proto qw/void vpx_highbd_convolve8_avg/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+ specialize qw/vpx_highbd_convolve8_avg avx2 neon/, "$sse2_x86_64";
- add_proto qw/void vpx_highbd_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
- specialize qw/vpx_highbd_convolve8_avg_horiz neon/, "$sse2_x86_64";
+ add_proto qw/void vpx_highbd_convolve8_avg_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+ specialize qw/vpx_highbd_convolve8_avg_horiz avx2 neon/, "$sse2_x86_64";
- add_proto qw/void vpx_highbd_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
- specialize qw/vpx_highbd_convolve8_avg_vert neon/, "$sse2_x86_64";
+ add_proto qw/void vpx_highbd_convolve8_avg_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+ specialize qw/vpx_highbd_convolve8_avg_vert avx2 neon/, "$sse2_x86_64";
} # CONFIG_VP9_HIGHBITDEPTH
#
@@ -484,7 +484,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vpx_fdct4x4 sse2/;
+ specialize qw/vpx_fdct4x4 neon sse2/;
add_proto qw/void vpx_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vpx_fdct4x4_1 sse2/;
@@ -532,7 +532,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
} else {
add_proto qw/void vpx_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vpx_fdct4x4 sse2 msa/;
+ specialize qw/vpx_fdct4x4 neon sse2 msa/;
add_proto qw/void vpx_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vpx_fdct4x4_1 sse2/;
@@ -563,234 +563,106 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
#
# Inverse transform
if (vpx_config("CONFIG_VP9") eq "yes") {
+
+add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride";
+add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
+add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride";
+add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int stride";
+add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
+add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int stride";
+add_proto qw/void vpx_idct16x16_38_add/, "const tran_low_t *input, uint8_t *dest, int stride";
+add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int stride";
+add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
+add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int stride";
+add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int stride";
+add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int stride";
+add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
+add_proto qw/void vpx_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride";
+add_proto qw/void vpx_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
+
+if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") {
+ # Note that there are more specializations appended when CONFIG_VP9_HIGHBITDEPTH is off.
+ specialize qw/vpx_idct4x4_16_add neon sse2/;
+ specialize qw/vpx_idct4x4_1_add neon sse2/;
+ specialize qw/vpx_idct8x8_64_add neon sse2 ssse3/;
+ specialize qw/vpx_idct8x8_12_add neon sse2 ssse3/;
+ specialize qw/vpx_idct8x8_1_add neon sse2/;
+ specialize qw/vpx_idct16x16_256_add neon sse2/;
+ specialize qw/vpx_idct16x16_38_add neon sse2/;
+ $vpx_idct16x16_38_add_sse2=vpx_idct16x16_256_add_sse2;
+ specialize qw/vpx_idct16x16_10_add neon sse2/;
+ specialize qw/vpx_idct16x16_1_add neon sse2/;
+ specialize qw/vpx_idct32x32_1024_add neon sse2 ssse3/;
+ specialize qw/vpx_idct32x32_135_add neon sse2 ssse3/;
+ $vpx_idct32x32_135_add_sse2=vpx_idct32x32_1024_add_sse2;
+ specialize qw/vpx_idct32x32_34_add neon sse2 ssse3/;
+ specialize qw/vpx_idct32x32_1_add neon sse2/;
+
+ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") ne "yes") {
+    # Note that these specializations append to the above ones.
+ specialize qw/vpx_idct4x4_16_add dspr2 msa/;
+ specialize qw/vpx_idct4x4_1_add dspr2 msa/;
+ specialize qw/vpx_idct8x8_64_add dspr2 msa/;
+ specialize qw/vpx_idct8x8_12_add dspr2 msa/;
+ specialize qw/vpx_idct8x8_1_add dspr2 msa/;
+ specialize qw/vpx_idct16x16_256_add dspr2 msa/;
+ specialize qw/vpx_idct16x16_38_add dspr2 msa/;
+ $vpx_idct16x16_38_add_dspr2=vpx_idct16x16_256_add_dspr2;
+ $vpx_idct16x16_38_add_msa=vpx_idct16x16_256_add_msa;
+ specialize qw/vpx_idct16x16_10_add dspr2 msa/;
+ specialize qw/vpx_idct16x16_1_add dspr2 msa/;
+ specialize qw/vpx_idct32x32_1024_add dspr2 msa/;
+ specialize qw/vpx_idct32x32_135_add dspr2 msa/;
+ $vpx_idct32x32_135_add_dspr2=vpx_idct32x32_1024_add_dspr2;
+ $vpx_idct32x32_135_add_msa=vpx_idct32x32_1024_add_msa;
+ specialize qw/vpx_idct32x32_34_add dspr2 msa/;
+ specialize qw/vpx_idct32x32_1_add dspr2 msa/;
+ specialize qw/vpx_iwht4x4_16_add msa sse2/;
+ specialize qw/vpx_iwht4x4_1_add msa/;
+ } # !CONFIG_VP9_HIGHBITDEPTH
+} # !CONFIG_EMULATE_HARDWARE
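+# Rough sketch of the mechanism: for each function above, rtcd emits a
+# dispatch pointer that is bound at startup to the best kernel the CPU
+# supports, e.g. (assuming SSE2 is detected):
+#
+#   vpx_idct4x4_16_add = vpx_idct4x4_16_add_sse2;  # falls back to _c
+#
+# The "$vpx_idct16x16_38_add_sse2=vpx_idct16x16_256_add_sse2" style
+# assignments alias one specialization to an existing kernel rather than
+# requiring a dedicated implementation.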
+
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
  # Note: as optimized versions of these functions are added, we need to add a check to ensure
  # that when CONFIG_EMULATE_HARDWARE is on, they default to the C versions only.
- add_proto qw/void vpx_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-
- add_proto qw/void vpx_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride";
specialize qw/vpx_iwht4x4_16_add sse2/;
- add_proto qw/void vpx_highbd_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
+ add_proto qw/void vpx_highbd_idct4x4_16_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
+ add_proto qw/void vpx_highbd_idct4x4_1_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
specialize qw/vpx_highbd_idct4x4_1_add neon/;
- add_proto qw/void vpx_highbd_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
+ add_proto qw/void vpx_highbd_idct8x8_64_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
+ add_proto qw/void vpx_highbd_idct8x8_12_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
+ add_proto qw/void vpx_highbd_idct8x8_1_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
specialize qw/vpx_highbd_idct8x8_1_add neon/;
- add_proto qw/void vpx_highbd_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
+ add_proto qw/void vpx_highbd_idct16x16_256_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
+ add_proto qw/void vpx_highbd_idct16x16_38_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
+ add_proto qw/void vpx_highbd_idct16x16_10_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
+ add_proto qw/void vpx_highbd_idct16x16_1_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
specialize qw/vpx_highbd_idct16x16_1_add neon/;
- add_proto qw/void vpx_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
-
- add_proto qw/void vpx_highbd_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
-
- add_proto qw/void vpx_highbd_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
+ add_proto qw/void vpx_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
+ add_proto qw/void vpx_highbd_idct32x32_135_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
+ add_proto qw/void vpx_highbd_idct32x32_34_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
+ add_proto qw/void vpx_highbd_idct32x32_1_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
specialize qw/vpx_highbd_idct32x32_1_add neon sse2/;
- add_proto qw/void vpx_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
-
- add_proto qw/void vpx_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
-
- # Force C versions if CONFIG_EMULATE_HARDWARE is 1
- if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
- add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-
- add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-
- add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-
- add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-
- add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-
- add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-
- add_proto qw/void vpx_idct16x16_38_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-
- add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-
- add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-
- add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-
- add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-
- add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-
- add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-
- add_proto qw/void vpx_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
-
- add_proto qw/void vpx_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
-
- add_proto qw/void vpx_highbd_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
-
- add_proto qw/void vpx_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
-
- add_proto qw/void vpx_highbd_idct16x16_38_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
-
- add_proto qw/void vpx_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
-
- add_proto qw/void vpx_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
-
- add_proto qw/void vpx_highbd_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
-
- add_proto qw/void vpx_highbd_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
- } else {
- add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- specialize qw/vpx_idct4x4_16_add neon sse2/;
-
- add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- specialize qw/vpx_idct4x4_1_add neon sse2/;
-
- add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- specialize qw/vpx_idct8x8_64_add neon sse2 ssse3/;
-
- add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- specialize qw/vpx_idct8x8_12_add neon sse2 ssse3/;
-
- add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- specialize qw/vpx_idct8x8_1_add neon sse2/;
-
- add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- specialize qw/vpx_idct16x16_256_add neon sse2/;
-
- add_proto qw/void vpx_idct16x16_38_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- specialize qw/vpx_idct16x16_38_add neon sse2/;
- $vpx_idct16x16_38_add_sse2=vpx_idct16x16_256_add_sse2;
-
- add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- specialize qw/vpx_idct16x16_10_add neon sse2/;
-
- add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- specialize qw/vpx_idct16x16_1_add neon sse2/;
-
- add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- specialize qw/vpx_idct32x32_1024_add neon sse2 ssse3/;
-
- add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- specialize qw/vpx_idct32x32_135_add neon sse2 ssse3/;
- # Need to add 135 eob idct32x32 implementations.
- $vpx_idct32x32_135_add_sse2=vpx_idct32x32_1024_add_sse2;
-
- add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- specialize qw/vpx_idct32x32_34_add neon sse2 ssse3/;
+ add_proto qw/void vpx_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
+ add_proto qw/void vpx_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
- add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- specialize qw/vpx_idct32x32_1_add neon sse2/;
-
- add_proto qw/void vpx_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
+ if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") {
specialize qw/vpx_highbd_idct4x4_16_add neon sse2/;
-
- add_proto qw/void vpx_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
specialize qw/vpx_highbd_idct8x8_64_add neon sse2/;
-
- add_proto qw/void vpx_highbd_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
specialize qw/vpx_highbd_idct8x8_12_add neon sse2/;
-
- add_proto qw/void vpx_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
specialize qw/vpx_highbd_idct16x16_256_add neon sse2/;
-
- add_proto qw/void vpx_highbd_idct16x16_38_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
specialize qw/vpx_highbd_idct16x16_38_add neon sse2/;
$vpx_highbd_idct16x16_38_add_sse2=vpx_highbd_idct16x16_256_add_sse2;
-
- add_proto qw/void vpx_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
specialize qw/vpx_highbd_idct16x16_10_add neon sse2/;
-
- add_proto qw/void vpx_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
specialize qw/vpx_highbd_idct32x32_1024_add neon/;
-
- add_proto qw/void vpx_highbd_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
specialize qw/vpx_highbd_idct32x32_135_add neon/;
-
- add_proto qw/void vpx_highbd_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
specialize qw/vpx_highbd_idct32x32_34_add neon/;
- } # CONFIG_EMULATE_HARDWARE
-} else {
- # Force C versions if CONFIG_EMULATE_HARDWARE is 1
- if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
- add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-
- add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-
- add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-
- add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-
- add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-
- add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-
- add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-
- add_proto qw/void vpx_idct16x16_38_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-
- add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-
- add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-
- add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-
- add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-
- add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-
- add_proto qw/void vpx_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-
- add_proto qw/void vpx_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- } else {
- add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- specialize qw/vpx_idct4x4_1_add sse2 neon dspr2 msa/;
-
- add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- specialize qw/vpx_idct4x4_16_add sse2 neon dspr2 msa/;
-
- add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- specialize qw/vpx_idct8x8_1_add sse2 neon dspr2 msa/;
-
- add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- specialize qw/vpx_idct8x8_64_add sse2 ssse3 neon dspr2 msa/;
-
- add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- specialize qw/vpx_idct8x8_12_add sse2 ssse3 neon dspr2 msa/;
-
- add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- specialize qw/vpx_idct16x16_1_add sse2 neon dspr2 msa/;
-
- add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- specialize qw/vpx_idct16x16_256_add sse2 neon dspr2 msa/;
-
- add_proto qw/void vpx_idct16x16_38_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- specialize qw/vpx_idct16x16_38_add sse2 neon dspr2 msa/;
- $vpx_idct16x16_38_add_sse2=vpx_idct16x16_256_add_sse2;
- $vpx_idct16x16_38_add_dspr2=vpx_idct16x16_256_add_dspr2;
- $vpx_idct16x16_38_add_msa=vpx_idct16x16_256_add_msa;
-
- add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- specialize qw/vpx_idct16x16_10_add sse2 neon dspr2 msa/;
-
- add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- specialize qw/vpx_idct32x32_1024_add sse2 ssse3 neon dspr2 msa/;
-
- add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- specialize qw/vpx_idct32x32_135_add sse2 ssse3 neon dspr2 msa/;
- $vpx_idct32x32_135_add_sse2=vpx_idct32x32_1024_add_sse2;
- $vpx_idct32x32_135_add_dspr2=vpx_idct32x32_1024_add_dspr2;
- $vpx_idct32x32_135_add_msa=vpx_idct32x32_1024_add_msa;
-
- add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- specialize qw/vpx_idct32x32_34_add sse2 ssse3 neon dspr2 msa/;
-
- add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- specialize qw/vpx_idct32x32_1_add sse2 neon dspr2 msa/;
-
- add_proto qw/void vpx_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- specialize qw/vpx_iwht4x4_1_add msa/;
-
- add_proto qw/void vpx_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- specialize qw/vpx_iwht4x4_16_add msa sse2/;
- } # CONFIG_EMULATE_HARDWARE
+ } # !CONFIG_EMULATE_HARDWARE
} # CONFIG_VP9_HIGHBITDEPTH
} # CONFIG_VP9
@@ -824,28 +696,28 @@ specialize qw/vpx_subtract_block neon msa sse2/;
# Single block SAD
#
add_proto qw/unsigned int vpx_sad64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad64x64 avx2 neon msa sse2/;
+specialize qw/vpx_sad64x64 avx2 neon msa sse2 vsx/;
add_proto qw/unsigned int vpx_sad64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad64x32 avx2 msa sse2/;
+specialize qw/vpx_sad64x32 avx2 msa sse2 vsx/;
add_proto qw/unsigned int vpx_sad32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad32x64 avx2 msa sse2/;
+specialize qw/vpx_sad32x64 avx2 msa sse2 vsx/;
add_proto qw/unsigned int vpx_sad32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad32x32 avx2 neon msa sse2/;
+specialize qw/vpx_sad32x32 avx2 neon msa sse2 vsx/;
add_proto qw/unsigned int vpx_sad32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad32x16 avx2 msa sse2/;
+specialize qw/vpx_sad32x16 avx2 msa sse2 vsx/;
add_proto qw/unsigned int vpx_sad16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad16x32 msa sse2/;
+specialize qw/vpx_sad16x32 msa sse2 vsx/;
add_proto qw/unsigned int vpx_sad16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad16x16 neon msa sse2/;
+specialize qw/vpx_sad16x16 neon msa sse2 vsx/;
add_proto qw/unsigned int vpx_sad16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad16x8 neon msa sse2/;
+specialize qw/vpx_sad16x8 neon msa sse2 vsx/;
add_proto qw/unsigned int vpx_sad8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vpx_sad8x16 neon msa sse2/;
@@ -1249,10 +1121,10 @@ add_proto qw/unsigned int vpx_variance32x32/, "const uint8_t *src_ptr, int sourc
specialize qw/vpx_variance32x32 sse2 avx2 neon msa/;
add_proto qw/unsigned int vpx_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_variance32x16 sse2 avx2 msa/;
+ specialize qw/vpx_variance32x16 sse2 avx2 neon msa/;
add_proto qw/unsigned int vpx_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_variance16x32 sse2 msa/;
+ specialize qw/vpx_variance16x32 sse2 neon msa/;
add_proto qw/unsigned int vpx_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance16x16 sse2 avx2 neon msa/;
@@ -1267,12 +1139,14 @@ add_proto qw/unsigned int vpx_variance8x8/, "const uint8_t *src_ptr, int source_
specialize qw/vpx_variance8x8 sse2 neon msa/;
add_proto qw/unsigned int vpx_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_variance8x4 sse2 msa/;
+ specialize qw/vpx_variance8x4 sse2 neon msa/;
add_proto qw/unsigned int vpx_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+# TODO(johannkoenig): neon
specialize qw/vpx_variance4x8 sse2 msa/;
add_proto qw/unsigned int vpx_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+# TODO(johannkoenig): neon
specialize qw/vpx_variance4x4 sse2 msa/;
#
@@ -1297,12 +1171,13 @@ add_proto qw/unsigned int vpx_mse8x8/, "const uint8_t *src_ptr, int source_stri
specialize qw/vpx_mse8x8 sse2 msa/;
add_proto qw/unsigned int vpx_get_mb_ss/, "const int16_t *";
- specialize qw/vpx_get_mb_ss sse2 msa/;
+ specialize qw/vpx_get_mb_ss sse2 msa vsx/;
add_proto qw/unsigned int vpx_get4x4sse_cs/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride";
- specialize qw/vpx_get4x4sse_cs neon msa/;
+ specialize qw/vpx_get4x4sse_cs neon msa vsx/;
add_proto qw/void vpx_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride";
+ specialize qw/vpx_comp_avg_pred sse2 vsx/;
#
# Subpixel Variance
@@ -1311,34 +1186,34 @@ add_proto qw/uint32_t vpx_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int
specialize qw/vpx_sub_pixel_variance64x64 avx2 neon msa sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_sub_pixel_variance64x32 msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_variance64x32 neon msa sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_sub_pixel_variance32x64 msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_variance32x64 neon msa sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_sub_pixel_variance32x32 avx2 neon msa sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_sub_pixel_variance32x16 msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_variance32x16 neon msa sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_sub_pixel_variance16x32 msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_variance16x32 neon msa sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_sub_pixel_variance16x16 neon msa sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_sub_pixel_variance16x8 msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_variance16x8 neon msa sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_sub_pixel_variance8x16 msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_variance8x16 neon msa sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_sub_pixel_variance8x8 neon msa sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_sub_pixel_variance8x4 msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_variance8x4 neon msa sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_sub_pixel_variance4x8 msa sse2 ssse3/;
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/avg_pred_sse2.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/avg_pred_sse2.c
new file mode 100644
index 00000000000..f83b26490e7
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/avg_pred_sse2.c
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
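+// For reference: each output byte is the rounded average of pred and ref,
+// i.e. the scalar equivalent of this kernel is
+//
+//   comp[j] = (pred[j] + ref[j] + 1) >> 1;
+//
+// _mm_avg_epu8 computes exactly this rounded average, 16 bytes at a time.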
+void vpx_comp_avg_pred_sse2(uint8_t *comp, const uint8_t *pred, int width,
+ int height, const uint8_t *ref, int ref_stride) {
+  /* comp and pred must be 16-byte aligned. */
+ assert(((intptr_t)comp & 0xf) == 0);
+ assert(((intptr_t)pred & 0xf) == 0);
+ if (width > 8) {
+ int x, y;
+ for (y = 0; y < height; ++y) {
+ for (x = 0; x < width; x += 16) {
+ const __m128i p = _mm_load_si128((const __m128i *)(pred + x));
+ const __m128i r = _mm_loadu_si128((const __m128i *)(ref + x));
+ const __m128i avg = _mm_avg_epu8(p, r);
+ _mm_store_si128((__m128i *)(comp + x), avg);
+ }
+ comp += width;
+ pred += width;
+ ref += ref_stride;
+ }
+ } else { // width must be 4 or 8.
+ int i;
+    // Process 16 elements at a time. comp and pred have width == stride and
+    // therefore live in contiguous memory. 4*4, 4*8, 8*4, 8*8, and 8*16 are
+    // all divisible by 16, so only ref needs to be massaged when loading.
+ for (i = 0; i < width * height; i += 16) {
+ const __m128i p = _mm_load_si128((const __m128i *)pred);
+ __m128i r;
+ __m128i avg;
+ if (width == ref_stride) {
+ r = _mm_loadu_si128((const __m128i *)ref);
+ ref += 16;
+ } else if (width == 4) {
+ r = _mm_set_epi32(*(const uint32_t *)(ref + 3 * ref_stride),
+ *(const uint32_t *)(ref + 2 * ref_stride),
+ *(const uint32_t *)(ref + ref_stride),
+ *(const uint32_t *)(ref));
+
+ ref += 4 * ref_stride;
+ } else {
+ const __m128i r_0 = _mm_loadl_epi64((const __m128i *)ref);
+ assert(width == 8);
+ r = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(r_0),
+ (const __m64 *)(ref + ref_stride)));
+
+ ref += 2 * ref_stride;
+ }
+ avg = _mm_avg_epu8(p, r);
+ _mm_store_si128((__m128i *)comp, avg);
+
+ pred += 16;
+ comp += 16;
+ }
+ }
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/convolve.h b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/convolve.h
index d7468ad7ca5..e69d6c61763 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/convolve.h
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/convolve.h
@@ -103,12 +103,10 @@ typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr,
#define HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
void vpx_highbd_convolve8_##name##_##opt( \
- const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8, \
+ const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, \
ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \
const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { \
if (step_q4 == 16 && filter[3] != 128) { \
- uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
if (filter[0] | filter[1] | filter[2]) { \
while (w >= 16) { \
vpx_highbd_filter_block1d16_##dir##8_##avg##opt( \
@@ -156,7 +154,7 @@ typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr,
} \
} \
if (w) { \
- vpx_highbd_convolve8_##name##_c(src8, src_stride, dst8, dst_stride, \
+ vpx_highbd_convolve8_##name##_c(src, src_stride, dst, dst_stride, \
filter_x, x_step_q4, filter_y, \
y_step_q4, w, h, bd); \
} \
@@ -164,7 +162,7 @@ typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr,
#define HIGH_FUN_CONV_2D(avg, opt) \
void vpx_highbd_convolve8_##avg##opt( \
- const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \
+ const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, \
ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \
const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { \
assert(w <= 64); \
@@ -172,20 +170,20 @@ typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr,
if (x_step_q4 == 16 && y_step_q4 == 16) { \
if ((filter_x[0] | filter_x[1] | filter_x[2]) || filter_x[3] == 128) { \
DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \
- vpx_highbd_convolve8_horiz_##opt( \
- src - 3 * src_stride, src_stride, CONVERT_TO_BYTEPTR(fdata2), 64, \
- filter_x, x_step_q4, filter_y, y_step_q4, w, h + 7, bd); \
+ vpx_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \
+ fdata2, 64, filter_x, x_step_q4, \
+ filter_y, y_step_q4, w, h + 7, bd); \
vpx_highbd_convolve8_##avg##vert_##opt( \
- CONVERT_TO_BYTEPTR(fdata2) + 192, 64, dst, dst_stride, filter_x, \
- x_step_q4, filter_y, y_step_q4, w, h, bd); \
+ fdata2 + 192, 64, dst, dst_stride, filter_x, x_step_q4, filter_y, \
+ y_step_q4, w, h, bd); \
} else { \
DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]); \
- vpx_highbd_convolve8_horiz_##opt( \
- src, src_stride, CONVERT_TO_BYTEPTR(fdata2), 64, filter_x, \
- x_step_q4, filter_y, y_step_q4, w, h + 1, bd); \
- vpx_highbd_convolve8_##avg##vert_##opt( \
- CONVERT_TO_BYTEPTR(fdata2), 64, dst, dst_stride, filter_x, \
- x_step_q4, filter_y, y_step_q4, w, h, bd); \
+ vpx_highbd_convolve8_horiz_##opt(src, src_stride, fdata2, 64, \
+ filter_x, x_step_q4, filter_y, \
+ y_step_q4, w, h + 1, bd); \
+ vpx_highbd_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \
+ filter_x, x_step_q4, filter_y, \
+ y_step_q4, w, h, bd); \
} \
} else { \
vpx_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, \
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c
new file mode 100644
index 00000000000..2fc7b74303d
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c
@@ -0,0 +1,1106 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/convolve.h"
+
+// -----------------------------------------------------------------------------
+// Copy and average
+
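+// These two kernels cover the unfiltered paths: copy moves whole rows of
+// 16-bit pixels, and avg combines source and destination with
+// _mm256_avg_epu16/_mm_avg_epu16, i.e. the rounded average (a + b + 1) >> 1.
+// The filter arguments exist only to match the convolve prototype and are
+// cast to void.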
+void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int filter_x_stride,
+ const int16_t *filter_y, int filter_y_stride,
+ int width, int h, int bd) {
+ (void)filter_x;
+ (void)filter_y;
+ (void)filter_x_stride;
+ (void)filter_y_stride;
+ (void)bd;
+
+ assert(width % 4 == 0);
+ if (width > 32) { // width = 64
+ do {
+ const __m256i p0 = _mm256_loadu_si256((const __m256i *)src);
+ const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
+ const __m256i p2 = _mm256_loadu_si256((const __m256i *)(src + 32));
+ const __m256i p3 = _mm256_loadu_si256((const __m256i *)(src + 48));
+ src += src_stride;
+ _mm256_storeu_si256((__m256i *)dst, p0);
+ _mm256_storeu_si256((__m256i *)(dst + 16), p1);
+ _mm256_storeu_si256((__m256i *)(dst + 32), p2);
+ _mm256_storeu_si256((__m256i *)(dst + 48), p3);
+ dst += dst_stride;
+ h--;
+ } while (h > 0);
+ } else if (width > 16) { // width = 32
+ do {
+ const __m256i p0 = _mm256_loadu_si256((const __m256i *)src);
+ const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
+ src += src_stride;
+ _mm256_storeu_si256((__m256i *)dst, p0);
+ _mm256_storeu_si256((__m256i *)(dst + 16), p1);
+ dst += dst_stride;
+ h--;
+ } while (h > 0);
+ } else if (width > 8) { // width = 16
+ __m256i p0, p1;
+ do {
+ p0 = _mm256_loadu_si256((const __m256i *)src);
+ src += src_stride;
+ p1 = _mm256_loadu_si256((const __m256i *)src);
+ src += src_stride;
+
+ _mm256_storeu_si256((__m256i *)dst, p0);
+ dst += dst_stride;
+ _mm256_storeu_si256((__m256i *)dst, p1);
+ dst += dst_stride;
+ h -= 2;
+ } while (h > 0);
+ } else if (width > 4) { // width = 8
+ __m128i p0, p1;
+ do {
+ p0 = _mm_loadu_si128((const __m128i *)src);
+ src += src_stride;
+ p1 = _mm_loadu_si128((const __m128i *)src);
+ src += src_stride;
+
+ _mm_storeu_si128((__m128i *)dst, p0);
+ dst += dst_stride;
+ _mm_storeu_si128((__m128i *)dst, p1);
+ dst += dst_stride;
+ h -= 2;
+ } while (h > 0);
+ } else { // width = 4
+ __m128i p0, p1;
+ do {
+ p0 = _mm_loadl_epi64((const __m128i *)src);
+ src += src_stride;
+ p1 = _mm_loadl_epi64((const __m128i *)src);
+ src += src_stride;
+
+ _mm_storel_epi64((__m128i *)dst, p0);
+ dst += dst_stride;
+ _mm_storel_epi64((__m128i *)dst, p1);
+ dst += dst_stride;
+ h -= 2;
+ } while (h > 0);
+ }
+}
+
+void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int filter_x_stride,
+ const int16_t *filter_y, int filter_y_stride,
+ int width, int h, int bd) {
+ (void)filter_x;
+ (void)filter_y;
+ (void)filter_x_stride;
+ (void)filter_y_stride;
+ (void)bd;
+
+ assert(width % 4 == 0);
+ if (width > 32) { // width = 64
+ __m256i p0, p1, p2, p3, u0, u1, u2, u3;
+ do {
+ p0 = _mm256_loadu_si256((const __m256i *)src);
+ p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
+ p2 = _mm256_loadu_si256((const __m256i *)(src + 32));
+ p3 = _mm256_loadu_si256((const __m256i *)(src + 48));
+ src += src_stride;
+ u0 = _mm256_loadu_si256((const __m256i *)dst);
+ u1 = _mm256_loadu_si256((const __m256i *)(dst + 16));
+ u2 = _mm256_loadu_si256((const __m256i *)(dst + 32));
+ u3 = _mm256_loadu_si256((const __m256i *)(dst + 48));
+ _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0));
+ _mm256_storeu_si256((__m256i *)(dst + 16), _mm256_avg_epu16(p1, u1));
+ _mm256_storeu_si256((__m256i *)(dst + 32), _mm256_avg_epu16(p2, u2));
+ _mm256_storeu_si256((__m256i *)(dst + 48), _mm256_avg_epu16(p3, u3));
+ dst += dst_stride;
+ h--;
+ } while (h > 0);
+ } else if (width > 16) { // width = 32
+ __m256i p0, p1, u0, u1;
+ do {
+ p0 = _mm256_loadu_si256((const __m256i *)src);
+ p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
+ src += src_stride;
+ u0 = _mm256_loadu_si256((const __m256i *)dst);
+ u1 = _mm256_loadu_si256((const __m256i *)(dst + 16));
+ _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0));
+ _mm256_storeu_si256((__m256i *)(dst + 16), _mm256_avg_epu16(p1, u1));
+ dst += dst_stride;
+ h--;
+ } while (h > 0);
+ } else if (width > 8) { // width = 16
+ __m256i p0, p1, u0, u1;
+ do {
+ p0 = _mm256_loadu_si256((const __m256i *)src);
+ p1 = _mm256_loadu_si256((const __m256i *)(src + src_stride));
+ src += src_stride << 1;
+ u0 = _mm256_loadu_si256((const __m256i *)dst);
+ u1 = _mm256_loadu_si256((const __m256i *)(dst + dst_stride));
+
+ _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0));
+ _mm256_storeu_si256((__m256i *)(dst + dst_stride),
+ _mm256_avg_epu16(p1, u1));
+ dst += dst_stride << 1;
+ h -= 2;
+ } while (h > 0);
+ } else if (width > 4) { // width = 8
+ __m128i p0, p1, u0, u1;
+ do {
+ p0 = _mm_loadu_si128((const __m128i *)src);
+ p1 = _mm_loadu_si128((const __m128i *)(src + src_stride));
+ src += src_stride << 1;
+ u0 = _mm_loadu_si128((const __m128i *)dst);
+ u1 = _mm_loadu_si128((const __m128i *)(dst + dst_stride));
+
+ _mm_storeu_si128((__m128i *)dst, _mm_avg_epu16(p0, u0));
+ _mm_storeu_si128((__m128i *)(dst + dst_stride), _mm_avg_epu16(p1, u1));
+ dst += dst_stride << 1;
+ h -= 2;
+ } while (h > 0);
+ } else { // width = 4
+ __m128i p0, p1, u0, u1;
+ do {
+ p0 = _mm_loadl_epi64((const __m128i *)src);
+ p1 = _mm_loadl_epi64((const __m128i *)(src + src_stride));
+ src += src_stride << 1;
+ u0 = _mm_loadl_epi64((const __m128i *)dst);
+ u1 = _mm_loadl_epi64((const __m128i *)(dst + dst_stride));
+
+ _mm_storel_epi64((__m128i *)dst, _mm_avg_epu16(u0, p0));
+ _mm_storel_epi64((__m128i *)(dst + dst_stride), _mm_avg_epu16(u1, p1));
+ dst += dst_stride << 1;
+ h -= 2;
+ } while (h > 0);
+ }
+}
+
+// -----------------------------------------------------------------------------
+// Horizontal and vertical filtering
+
+#define CONV8_ROUNDING_BITS (7)
+
+static const uint8_t signal_pattern_0[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6,
+ 7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3,
+ 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 };
+
+static const uint8_t signal_pattern_1[32] = { 4, 5, 6, 7, 6, 7, 8, 9,
+ 8, 9, 10, 11, 10, 11, 12, 13,
+ 4, 5, 6, 7, 6, 7, 8, 9,
+ 8, 9, 10, 11, 10, 11, 12, 13 };
+
+static const uint8_t signal_pattern_2[32] = { 6, 7, 8, 9, 8, 9, 10, 11,
+ 10, 11, 12, 13, 12, 13, 14, 15,
+ 6, 7, 8, 9, 8, 9, 10, 11,
+ 10, 11, 12, 13, 12, 13, 14, 15 };
+
+static const uint32_t signal_index[8] = { 2, 3, 4, 5, 2, 3, 4, 5 };
+
+// -----------------------------------------------------------------------------
+// Horizontal Filtering
+
+static INLINE void pack_pixels(const __m256i *s, __m256i *p /*p[4]*/) {
+ const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
+ const __m256i sf0 = _mm256_loadu_si256((const __m256i *)signal_pattern_0);
+ const __m256i sf1 = _mm256_loadu_si256((const __m256i *)signal_pattern_1);
+ const __m256i c = _mm256_permutevar8x32_epi32(*s, idx);
+
+ p[0] = _mm256_shuffle_epi8(*s, sf0); // x0x6
+ p[1] = _mm256_shuffle_epi8(*s, sf1); // x1x7
+ p[2] = _mm256_shuffle_epi8(c, sf0); // x2x4
+ p[3] = _mm256_shuffle_epi8(c, sf1); // x3x5
+}
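+// In effect, each p[k] holds adjacent sample pairs arranged so that a single
+// _mm256_madd_epi16 against the replicated tap pair f[k] from pack_filters()
+// yields the matching partial sums of the 8-tap filter for eight outputs.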
+
+// Note:
+// Shared by the 8x2 and 16x1 blocks
+static INLINE void pack_16_pixels(const __m256i *s0, const __m256i *s1,
+ __m256i *x /*x[8]*/) {
+ __m256i pp[8];
+ pack_pixels(s0, pp);
+ pack_pixels(s1, &pp[4]);
+ x[0] = _mm256_permute2x128_si256(pp[0], pp[4], 0x20);
+ x[1] = _mm256_permute2x128_si256(pp[1], pp[5], 0x20);
+ x[2] = _mm256_permute2x128_si256(pp[2], pp[6], 0x20);
+ x[3] = _mm256_permute2x128_si256(pp[3], pp[7], 0x20);
+ x[4] = x[2];
+ x[5] = x[3];
+ x[6] = _mm256_permute2x128_si256(pp[0], pp[4], 0x31);
+ x[7] = _mm256_permute2x128_si256(pp[1], pp[5], 0x31);
+}
+
+static INLINE void pack_8x1_pixels(const uint16_t *src, __m256i *x) {
+ __m256i pp[8];
+ __m256i s0;
+ s0 = _mm256_loadu_si256((const __m256i *)src);
+ pack_pixels(&s0, pp);
+ x[0] = _mm256_permute2x128_si256(pp[0], pp[2], 0x30);
+ x[1] = _mm256_permute2x128_si256(pp[1], pp[3], 0x30);
+ x[2] = _mm256_permute2x128_si256(pp[2], pp[0], 0x30);
+ x[3] = _mm256_permute2x128_si256(pp[3], pp[1], 0x30);
+}
+
+static INLINE void pack_8x2_pixels(const uint16_t *src, ptrdiff_t stride,
+ __m256i *x) {
+ __m256i s0, s1;
+ s0 = _mm256_loadu_si256((const __m256i *)src);
+ s1 = _mm256_loadu_si256((const __m256i *)(src + stride));
+ pack_16_pixels(&s0, &s1, x);
+}
+
+static INLINE void pack_16x1_pixels(const uint16_t *src, __m256i *x) {
+ __m256i s0, s1;
+ s0 = _mm256_loadu_si256((const __m256i *)src);
+ s1 = _mm256_loadu_si256((const __m256i *)(src + 8));
+ pack_16_pixels(&s0, &s1, x);
+}
+
+// Note:
+// Shared by horizontal and vertical filtering
+static INLINE void pack_filters(const int16_t *filter, __m256i *f /*f[4]*/) {
+ const __m128i h = _mm_loadu_si128((const __m128i *)filter);
+ const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1);
+ const __m256i p0 = _mm256_set1_epi32(0x03020100);
+ const __m256i p1 = _mm256_set1_epi32(0x07060504);
+ const __m256i p2 = _mm256_set1_epi32(0x0b0a0908);
+ const __m256i p3 = _mm256_set1_epi32(0x0f0e0d0c);
+ f[0] = _mm256_shuffle_epi8(hh, p0);
+ f[1] = _mm256_shuffle_epi8(hh, p1);
+ f[2] = _mm256_shuffle_epi8(hh, p2);
+ f[3] = _mm256_shuffle_epi8(hh, p3);
+}
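+// The shuffles above broadcast tap pair (2k, 2k + 1) of the 8-tap filter into
+// every 32-bit lane of f[k], mirroring the sample-pair layout produced by
+// pack_pixels() so each madd accumulates two taps per multiply.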
+
+static INLINE void filter_8x1_pixels(const __m256i *sig /*sig[4]*/,
+ const __m256i *fil /*fil[4]*/,
+ __m256i *y) {
+ __m256i a, a0, a1;
+
+ a0 = _mm256_madd_epi16(fil[0], sig[0]);
+ a1 = _mm256_madd_epi16(fil[3], sig[3]);
+ a = _mm256_add_epi32(a0, a1);
+
+ a0 = _mm256_madd_epi16(fil[1], sig[1]);
+ a1 = _mm256_madd_epi16(fil[2], sig[2]);
+
+ {
+ const __m256i min = _mm256_min_epi32(a0, a1);
+ a = _mm256_add_epi32(a, min);
+ }
+ {
+ const __m256i max = _mm256_max_epi32(a0, a1);
+ a = _mm256_add_epi32(a, max);
+ }
+ {
+ const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
+ a = _mm256_add_epi32(a, rounding);
+ *y = _mm256_srai_epi32(a, CONV8_ROUNDING_BITS);
+ }
+}
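+// Adding the smaller of the two middle products first and the larger last
+// keeps the intermediate 32-bit sums as small as possible; the min/max
+// ordering is presumably chosen to avoid signed overflow before rounding.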
+
+static INLINE void store_8x1_pixels(const __m256i *y, const __m256i *mask,
+ uint16_t *dst) {
+ const __m128i a0 = _mm256_castsi256_si128(*y);
+ const __m128i a1 = _mm256_extractf128_si256(*y, 1);
+ __m128i res = _mm_packus_epi32(a0, a1);
+ res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask));
+ _mm_storeu_si128((__m128i *)dst, res);
+}
+
+static INLINE void store_8x2_pixels(const __m256i *y0, const __m256i *y1,
+ const __m256i *mask, uint16_t *dst,
+ ptrdiff_t pitch) {
+ __m256i a = _mm256_packus_epi32(*y0, *y1);
+ a = _mm256_min_epi16(a, *mask);
+ _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(a));
+ _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1));
+}
+
+static INLINE void store_16x1_pixels(const __m256i *y0, const __m256i *y1,
+ const __m256i *mask, uint16_t *dst) {
+ __m256i a = _mm256_packus_epi32(*y0, *y1);
+ a = _mm256_min_epi16(a, *mask);
+ _mm256_storeu_si256((__m256i *)dst, a);
+}
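+// The store helpers pack the 32-bit results to unsigned 16 bits with
+// saturation (packus) and then clamp against the (1 << bd) - 1 mask so
+// outputs never exceed the configured bit depth.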
+
+static void vpx_highbd_filter_block1d8_h8_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m256i signal[8], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+ __m256i ff[4];
+ pack_filters(filter, ff);
+
+ src_ptr -= 3;
+ do {
+ pack_8x2_pixels(src_ptr, src_pitch, signal);
+ filter_8x1_pixels(signal, ff, &res0);
+ filter_8x1_pixels(&signal[4], ff, &res1);
+ store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
+ height -= 2;
+ src_ptr += src_pitch << 1;
+ dst_ptr += dst_pitch << 1;
+ } while (height > 1);
+
+ if (height > 0) {
+ pack_8x1_pixels(src_ptr, signal);
+ filter_8x1_pixels(signal, ff, &res0);
+ store_8x1_pixels(&res0, &max, dst_ptr);
+ }
+}
+
+static void vpx_highbd_filter_block1d16_h8_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m256i signal[8], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+ __m256i ff[4];
+ pack_filters(filter, ff);
+
+ src_ptr -= 3;
+ do {
+ pack_16x1_pixels(src_ptr, signal);
+ filter_8x1_pixels(signal, ff, &res0);
+ filter_8x1_pixels(&signal[4], ff, &res1);
+ store_16x1_pixels(&res0, &res1, &max, dst_ptr);
+ height -= 1;
+ src_ptr += src_pitch;
+ dst_ptr += dst_pitch;
+ } while (height > 0);
+}
+
+// -----------------------------------------------------------------------------
+// 2-tap horizontal filtering
+
+static INLINE void pack_2t_filter(const int16_t *filter, __m256i *f) {
+ const __m128i h = _mm_loadu_si128((const __m128i *)filter);
+ const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1);
+ const __m256i p = _mm256_set1_epi32(0x09080706);
+ f[0] = _mm256_shuffle_epi8(hh, p);
+}
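+// The 0x09080706 pattern broadcasts 16-bit taps 3 and 4 into every 32-bit
+// lane; bilinear (2-tap) filters live in the middle of the 8-tap array, so
+// these are the only nonzero taps.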
+
+// Can be used by pack_8x2_2t_pixels() and pack_16x1_2t_pixels(). The
+// difference is whether s0/s1 hold the first and second rows, or the first
+// 16 samples and the same samples shifted by 8.
+static INLINE void pack_16_2t_pixels(const __m256i *s0, const __m256i *s1,
+ __m256i *sig) {
+ const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
+ const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2);
+ __m256i x0 = _mm256_shuffle_epi8(*s0, sf2);
+ __m256i x1 = _mm256_shuffle_epi8(*s1, sf2);
+ __m256i r0 = _mm256_permutevar8x32_epi32(*s0, idx);
+ __m256i r1 = _mm256_permutevar8x32_epi32(*s1, idx);
+ r0 = _mm256_shuffle_epi8(r0, sf2);
+ r1 = _mm256_shuffle_epi8(r1, sf2);
+ sig[0] = _mm256_permute2x128_si256(x0, x1, 0x20);
+ sig[1] = _mm256_permute2x128_si256(r0, r1, 0x20);
+}
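+// sig[0]/sig[1] now hold adjacent sample pairs aligned with the replicated
+// (tap3, tap4) pair from pack_2t_filter(), so a single madd per vector
+// evaluates the 2-tap filter for eight outputs.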
+
+static INLINE void pack_8x2_2t_pixels(const uint16_t *src,
+ const ptrdiff_t pitch, __m256i *sig) {
+ const __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
+ const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + pitch));
+ pack_16_2t_pixels(&r0, &r1, sig);
+}
+
+static INLINE void pack_16x1_2t_pixels(const uint16_t *src,
+ __m256i *sig /*sig[2]*/) {
+ const __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
+ const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + 8));
+ pack_16_2t_pixels(&r0, &r1, sig);
+}
+
+static INLINE void pack_8x1_2t_pixels(const uint16_t *src,
+ __m256i *sig /*sig[2]*/) {
+ const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
+ const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2);
+ __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
+ __m256i x0 = _mm256_shuffle_epi8(r0, sf2);
+ r0 = _mm256_permutevar8x32_epi32(r0, idx);
+ r0 = _mm256_shuffle_epi8(r0, sf2);
+ sig[0] = _mm256_permute2x128_si256(x0, r0, 0x20);
+}
+
+// Can be used by filter_8x2_2t_pixels() and filter_16x1_2t_pixels().
+static INLINE void filter_16_2t_pixels(const __m256i *sig, const __m256i *f,
+ __m256i *y0, __m256i *y1) {
+ const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
+ __m256i x0 = _mm256_madd_epi16(sig[0], *f);
+ __m256i x1 = _mm256_madd_epi16(sig[1], *f);
+ x0 = _mm256_add_epi32(x0, rounding);
+ x1 = _mm256_add_epi32(x1, rounding);
+ *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS);
+ *y1 = _mm256_srai_epi32(x1, CONV8_ROUNDING_BITS);
+}
+
+static INLINE void filter_8x1_2t_pixels(const __m256i *sig, const __m256i *f,
+ __m256i *y0) {
+ const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
+ __m256i x0 = _mm256_madd_epi16(sig[0], *f);
+ x0 = _mm256_add_epi32(x0, rounding);
+ *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS);
+}
+
+static void vpx_highbd_filter_block1d8_h2_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m256i signal[2], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+ __m256i ff;
+ pack_2t_filter(filter, &ff);
+
+ src_ptr -= 3;
+ do {
+ pack_8x2_2t_pixels(src_ptr, src_pitch, signal);
+ filter_16_2t_pixels(signal, &ff, &res0, &res1);
+ store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
+ height -= 2;
+ src_ptr += src_pitch << 1;
+ dst_ptr += dst_pitch << 1;
+ } while (height > 1);
+
+ if (height > 0) {
+ pack_8x1_2t_pixels(src_ptr, signal);
+ filter_8x1_2t_pixels(signal, &ff, &res0);
+ store_8x1_pixels(&res0, &max, dst_ptr);
+ }
+}
+
+static void vpx_highbd_filter_block1d16_h2_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m256i signal[2], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+ __m256i ff;
+ pack_2t_filter(filter, &ff);
+
+ src_ptr -= 3;
+ do {
+ pack_16x1_2t_pixels(src_ptr, signal);
+ filter_16_2t_pixels(signal, &ff, &res0, &res1);
+ store_16x1_pixels(&res0, &res1, &max, dst_ptr);
+ height -= 1;
+ src_ptr += src_pitch;
+ dst_ptr += dst_pitch;
+ } while (height > 0);
+}
+
+// -----------------------------------------------------------------------------
+// Vertical Filtering
+
+static void pack_8x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) {
+ __m256i s0 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)src));
+ __m256i s1 =
+ _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src + pitch)));
+ __m256i s2 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src + 2 * pitch)));
+ __m256i s3 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src + 3 * pitch)));
+ __m256i s4 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src + 4 * pitch)));
+ __m256i s5 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src + 5 * pitch)));
+ __m256i s6 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src + 6 * pitch)));
+
+ s0 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1);
+ s1 = _mm256_inserti128_si256(s1, _mm256_castsi256_si128(s2), 1);
+ s2 = _mm256_inserti128_si256(s2, _mm256_castsi256_si128(s3), 1);
+ s3 = _mm256_inserti128_si256(s3, _mm256_castsi256_si128(s4), 1);
+ s4 = _mm256_inserti128_si256(s4, _mm256_castsi256_si128(s5), 1);
+ s5 = _mm256_inserti128_si256(s5, _mm256_castsi256_si128(s6), 1);
+
+ sig[0] = _mm256_unpacklo_epi16(s0, s1);
+ sig[4] = _mm256_unpackhi_epi16(s0, s1);
+ sig[1] = _mm256_unpacklo_epi16(s2, s3);
+ sig[5] = _mm256_unpackhi_epi16(s2, s3);
+ sig[2] = _mm256_unpacklo_epi16(s4, s5);
+ sig[6] = _mm256_unpackhi_epi16(s4, s5);
+ sig[8] = s6;
+}
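+// The init interleaves rows (r, r + 1) with unpacklo/unpackhi so that each
+// 32-bit lane holds a vertical sample pair; one madd with a replicated tap
+// pair then covers two of the eight vertical taps. sig[8] carries the last
+// loaded row forward for the next pack_8x9_pixels() call.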
+
+static INLINE void pack_8x9_pixels(const uint16_t *src, ptrdiff_t pitch,
+ __m256i *sig) {
+ // base + 7th row
+ __m256i s0 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src + 7 * pitch)));
+ // base + 8th row
+ __m256i s1 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src + 8 * pitch)));
+ __m256i s2 = _mm256_inserti128_si256(sig[8], _mm256_castsi256_si128(s0), 1);
+ __m256i s3 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1);
+ sig[3] = _mm256_unpacklo_epi16(s2, s3);
+ sig[7] = _mm256_unpackhi_epi16(s2, s3);
+ sig[8] = s1;
+}
+
+static INLINE void filter_8x9_pixels(const __m256i *sig, const __m256i *f,
+ __m256i *y0, __m256i *y1) {
+ filter_8x1_pixels(sig, f, y0);
+ filter_8x1_pixels(&sig[4], f, y1);
+}
+
+static INLINE void update_pixels(__m256i *sig) {
+ int i;
+ for (i = 0; i < 3; ++i) {
+ sig[i] = sig[i + 1];
+ sig[i + 4] = sig[i + 5];
+ }
+}
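+// update_pixels() slides the interleaved row window down by two, so the
+// vertical kernels below produce two output rows per iteration and assume
+// an even height.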
+
+static void vpx_highbd_filter_block1d8_v8_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m256i signal[9], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+ __m256i ff[4];
+ pack_filters(filter, ff);
+
+ pack_8x9_init(src_ptr, src_pitch, signal);
+
+ do {
+ pack_8x9_pixels(src_ptr, src_pitch, signal);
+
+ filter_8x9_pixels(signal, ff, &res0, &res1);
+ store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
+ update_pixels(signal);
+
+ src_ptr += src_pitch << 1;
+ dst_ptr += dst_pitch << 1;
+ height -= 2;
+ } while (height > 0);
+}
+
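+// 16-wide variant of the scheme above: each 256-bit row is split into its
+// low and high 128-bit lanes with permute2x128 and the pairwise interleave
+// is applied to both halves, so the window needs 17 registers in total
+// (sixteen row-pair registers plus the carried last row in sig[16]).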
+static void pack_16x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) {
+ __m256i u0, u1, u2, u3;
+  // Load rows 0-6
+ const __m256i s0 = _mm256_loadu_si256((const __m256i *)src);
+ const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src + pitch));
+ const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src + 2 * pitch));
+ const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src + 3 * pitch));
+ const __m256i s4 = _mm256_loadu_si256((const __m256i *)(src + 4 * pitch));
+ const __m256i s5 = _mm256_loadu_si256((const __m256i *)(src + 5 * pitch));
+ const __m256i s6 = _mm256_loadu_si256((const __m256i *)(src + 6 * pitch));
+
+ u0 = _mm256_permute2x128_si256(s0, s1, 0x20); // 0, 1 low
+ u1 = _mm256_permute2x128_si256(s0, s1, 0x31); // 0, 1 high
+
+ u2 = _mm256_permute2x128_si256(s1, s2, 0x20); // 1, 2 low
+ u3 = _mm256_permute2x128_si256(s1, s2, 0x31); // 1, 2 high
+
+ sig[0] = _mm256_unpacklo_epi16(u0, u2);
+ sig[4] = _mm256_unpackhi_epi16(u0, u2);
+
+ sig[8] = _mm256_unpacklo_epi16(u1, u3);
+ sig[12] = _mm256_unpackhi_epi16(u1, u3);
+
+ u0 = _mm256_permute2x128_si256(s2, s3, 0x20);
+ u1 = _mm256_permute2x128_si256(s2, s3, 0x31);
+
+ u2 = _mm256_permute2x128_si256(s3, s4, 0x20);
+ u3 = _mm256_permute2x128_si256(s3, s4, 0x31);
+
+ sig[1] = _mm256_unpacklo_epi16(u0, u2);
+ sig[5] = _mm256_unpackhi_epi16(u0, u2);
+
+ sig[9] = _mm256_unpacklo_epi16(u1, u3);
+ sig[13] = _mm256_unpackhi_epi16(u1, u3);
+
+ u0 = _mm256_permute2x128_si256(s4, s5, 0x20);
+ u1 = _mm256_permute2x128_si256(s4, s5, 0x31);
+
+ u2 = _mm256_permute2x128_si256(s5, s6, 0x20);
+ u3 = _mm256_permute2x128_si256(s5, s6, 0x31);
+
+ sig[2] = _mm256_unpacklo_epi16(u0, u2);
+ sig[6] = _mm256_unpackhi_epi16(u0, u2);
+
+ sig[10] = _mm256_unpacklo_epi16(u1, u3);
+ sig[14] = _mm256_unpackhi_epi16(u1, u3);
+
+ sig[16] = s6;
+}
+
+static void pack_16x9_pixels(const uint16_t *src, ptrdiff_t pitch,
+ __m256i *sig) {
+ // base + 7th row
+ const __m256i s7 = _mm256_loadu_si256((const __m256i *)(src + 7 * pitch));
+ // base + 8th row
+ const __m256i s8 = _mm256_loadu_si256((const __m256i *)(src + 8 * pitch));
+
+ __m256i u0, u1, u2, u3;
+ u0 = _mm256_permute2x128_si256(sig[16], s7, 0x20);
+ u1 = _mm256_permute2x128_si256(sig[16], s7, 0x31);
+
+ u2 = _mm256_permute2x128_si256(s7, s8, 0x20);
+ u3 = _mm256_permute2x128_si256(s7, s8, 0x31);
+
+ sig[3] = _mm256_unpacklo_epi16(u0, u2);
+ sig[7] = _mm256_unpackhi_epi16(u0, u2);
+
+ sig[11] = _mm256_unpacklo_epi16(u1, u3);
+ sig[15] = _mm256_unpackhi_epi16(u1, u3);
+
+ sig[16] = s8;
+}
+
+static INLINE void filter_16x9_pixels(const __m256i *sig, const __m256i *f,
+ __m256i *y0, __m256i *y1) {
+ __m256i res[4];
+ int i;
+ for (i = 0; i < 4; ++i) {
+ filter_8x1_pixels(&sig[i << 2], f, &res[i]);
+ }
+
+ {
+ const __m256i l0l1 = _mm256_packus_epi32(res[0], res[1]);
+ const __m256i h0h1 = _mm256_packus_epi32(res[2], res[3]);
+ *y0 = _mm256_permute2x128_si256(l0l1, h0h1, 0x20);
+ *y1 = _mm256_permute2x128_si256(l0l1, h0h1, 0x31);
+ }
+}
+
+static INLINE void store_16x2_pixels(const __m256i *y0, const __m256i *y1,
+ const __m256i *mask, uint16_t *dst,
+ ptrdiff_t pitch) {
+ __m256i p = _mm256_min_epi16(*y0, *mask);
+ _mm256_storeu_si256((__m256i *)dst, p);
+ p = _mm256_min_epi16(*y1, *mask);
+ _mm256_storeu_si256((__m256i *)(dst + pitch), p);
+}
+
+static void update_16x9_pixels(__m256i *sig) {
+ update_pixels(&sig[0]);
+ update_pixels(&sig[8]);
+}
+
+static void vpx_highbd_filter_block1d16_v8_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m256i signal[17], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+ __m256i ff[4];
+ pack_filters(filter, ff);
+
+ pack_16x9_init(src_ptr, src_pitch, signal);
+
+ do {
+ pack_16x9_pixels(src_ptr, src_pitch, signal);
+ filter_16x9_pixels(signal, ff, &res0, &res1);
+ store_16x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
+ update_16x9_pixels(signal);
+
+ src_ptr += src_pitch << 1;
+ dst_ptr += dst_pitch << 1;
+ height -= 2;
+ } while (height > 0);
+}
+
+// -----------------------------------------------------------------------------
+// 2-tap vertical filtering
+
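+// For the 2-tap (bilinear) vertical filter only two source rows are live at
+// a time: sig[2] carries the previous row, and each iteration interleaves it
+// with the next row before a single multiply-accumulate per half.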
+static void pack_16x2_init(const uint16_t *src, __m256i *sig) {
+ sig[2] = _mm256_loadu_si256((const __m256i *)src);
+}
+
+static INLINE void pack_16x2_2t_pixels(const uint16_t *src, ptrdiff_t pitch,
+ __m256i *sig) {
+ // load the next row
+ const __m256i u = _mm256_loadu_si256((const __m256i *)(src + pitch));
+ sig[0] = _mm256_unpacklo_epi16(sig[2], u);
+ sig[1] = _mm256_unpackhi_epi16(sig[2], u);
+ sig[2] = u;
+}
+
+static INLINE void filter_16x2_2t_pixels(const __m256i *sig, const __m256i *f,
+ __m256i *y0, __m256i *y1) {
+ filter_16_2t_pixels(sig, f, y0, y1);
+}
+
+static void vpx_highbd_filter_block1d16_v2_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m256i signal[3], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+ __m256i ff;
+
+ pack_2t_filter(filter, &ff);
+ pack_16x2_init(src_ptr, signal);
+
+ do {
+ pack_16x2_2t_pixels(src_ptr, src_pitch, signal);
+ filter_16x2_2t_pixels(signal, &ff, &res0, &res1);
+ store_16x1_pixels(&res0, &res1, &max, dst_ptr);
+
+ src_ptr += src_pitch;
+ dst_ptr += dst_pitch;
+ height -= 1;
+ } while (height > 0);
+}
+
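+// The shuffle mask 0x09080706 broadcasts bytes 6-9 of the 8-tap array, i.e.
+// the two middle taps (indices 3 and 4), which is where the bilinear
+// kernels keep their non-zero coefficients.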
+static INLINE void pack_8x1_2t_filter(const int16_t *filter, __m128i *f) {
+ const __m128i h = _mm_loadu_si128((const __m128i *)filter);
+ const __m128i p = _mm_set1_epi32(0x09080706);
+ f[0] = _mm_shuffle_epi8(h, p);
+}
+
+static void pack_8x2_init(const uint16_t *src, __m128i *sig) {
+ sig[2] = _mm_loadu_si128((const __m128i *)src);
+}
+
+static INLINE void pack_8x2_2t_pixels_ver(const uint16_t *src, ptrdiff_t pitch,
+ __m128i *sig) {
+ // load the next row
+ const __m128i u = _mm_loadu_si128((const __m128i *)(src + pitch));
+ sig[0] = _mm_unpacklo_epi16(sig[2], u);
+ sig[1] = _mm_unpackhi_epi16(sig[2], u);
+ sig[2] = u;
+}
+
+static INLINE void filter_8_2t_pixels(const __m128i *sig, const __m128i *f,
+ __m128i *y0, __m128i *y1) {
+ const __m128i rounding = _mm_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
+ __m128i x0 = _mm_madd_epi16(sig[0], *f);
+ __m128i x1 = _mm_madd_epi16(sig[1], *f);
+ x0 = _mm_add_epi32(x0, rounding);
+ x1 = _mm_add_epi32(x1, rounding);
+ *y0 = _mm_srai_epi32(x0, CONV8_ROUNDING_BITS);
+ *y1 = _mm_srai_epi32(x1, CONV8_ROUNDING_BITS);
+}
+
+static INLINE void store_8x1_2t_pixels_ver(const __m128i *y0, const __m128i *y1,
+ const __m128i *mask, uint16_t *dst) {
+ __m128i res = _mm_packus_epi32(*y0, *y1);
+ res = _mm_min_epi16(res, *mask);
+ _mm_storeu_si128((__m128i *)dst, res);
+}
+
+static void vpx_highbd_filter_block1d8_v2_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m128i signal[3], res0, res1;
+ const __m128i max = _mm_set1_epi16((1 << bd) - 1);
+ __m128i ff;
+
+ pack_8x1_2t_filter(filter, &ff);
+ pack_8x2_init(src_ptr, signal);
+
+ do {
+ pack_8x2_2t_pixels_ver(src_ptr, src_pitch, signal);
+ filter_8_2t_pixels(signal, &ff, &res0, &res1);
+ store_8x1_2t_pixels_ver(&res0, &res1, &max, dst_ptr);
+
+ src_ptr += src_pitch;
+ dst_ptr += dst_pitch;
+ height -= 1;
+ } while (height > 0);
+}
+
+// Calculations that average the filtered output with the destination pixels
+
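+// The _avg store variants below load the pixels already present in the
+// destination and combine them with the filtered result via _mm_avg_epu16 /
+// _mm256_avg_epu16, a rounding average ((a + b + 1) >> 1) on each lane.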
+static INLINE void store_8x1_avg_pixels(const __m256i *y0, const __m256i *mask,
+ uint16_t *dst) {
+ const __m128i a0 = _mm256_castsi256_si128(*y0);
+ const __m128i a1 = _mm256_extractf128_si256(*y0, 1);
+ __m128i res = _mm_packus_epi32(a0, a1);
+ const __m128i pix = _mm_loadu_si128((const __m128i *)dst);
+ res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask));
+ res = _mm_avg_epu16(res, pix);
+ _mm_storeu_si128((__m128i *)dst, res);
+}
+
+static INLINE void store_8x2_avg_pixels(const __m256i *y0, const __m256i *y1,
+ const __m256i *mask, uint16_t *dst,
+ ptrdiff_t pitch) {
+ __m256i a = _mm256_packus_epi32(*y0, *y1);
+ const __m128i pix0 = _mm_loadu_si128((const __m128i *)dst);
+ const __m128i pix1 = _mm_loadu_si128((const __m128i *)(dst + pitch));
+ const __m256i pix =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(pix0), pix1, 1);
+ a = _mm256_min_epi16(a, *mask);
+ a = _mm256_avg_epu16(a, pix);
+ _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(a));
+ _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1));
+}
+
+static INLINE void store_16x1_avg_pixels(const __m256i *y0, const __m256i *y1,
+ const __m256i *mask, uint16_t *dst) {
+ __m256i a = _mm256_packus_epi32(*y0, *y1);
+ const __m256i pix = _mm256_loadu_si256((const __m256i *)dst);
+ a = _mm256_min_epi16(a, *mask);
+ a = _mm256_avg_epu16(a, pix);
+ _mm256_storeu_si256((__m256i *)dst, a);
+}
+
+static INLINE void store_16x2_avg_pixels(const __m256i *y0, const __m256i *y1,
+ const __m256i *mask, uint16_t *dst,
+ ptrdiff_t pitch) {
+ const __m256i pix0 = _mm256_loadu_si256((const __m256i *)dst);
+ const __m256i pix1 = _mm256_loadu_si256((const __m256i *)(dst + pitch));
+ __m256i p = _mm256_min_epi16(*y0, *mask);
+ p = _mm256_avg_epu16(p, pix0);
+ _mm256_storeu_si256((__m256i *)dst, p);
+
+ p = _mm256_min_epi16(*y1, *mask);
+ p = _mm256_avg_epu16(p, pix1);
+ _mm256_storeu_si256((__m256i *)(dst + pitch), p);
+}
+
+static INLINE void store_8x1_2t_avg_pixels_ver(const __m128i *y0,
+ const __m128i *y1,
+ const __m128i *mask,
+ uint16_t *dst) {
+ __m128i res = _mm_packus_epi32(*y0, *y1);
+ const __m128i pix = _mm_loadu_si128((const __m128i *)dst);
+ res = _mm_min_epi16(res, *mask);
+ res = _mm_avg_epu16(res, pix);
+ _mm_storeu_si128((__m128i *)dst, res);
+}
+
+static void vpx_highbd_filter_block1d8_h8_avg_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m256i signal[8], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+ __m256i ff[4];
+ pack_filters(filter, ff);
+
+ src_ptr -= 3;
+ do {
+ pack_8x2_pixels(src_ptr, src_pitch, signal);
+ filter_8x1_pixels(signal, ff, &res0);
+ filter_8x1_pixels(&signal[4], ff, &res1);
+ store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
+ height -= 2;
+ src_ptr += src_pitch << 1;
+ dst_ptr += dst_pitch << 1;
+ } while (height > 1);
+
+ if (height > 0) {
+ pack_8x1_pixels(src_ptr, signal);
+ filter_8x1_pixels(signal, ff, &res0);
+ store_8x1_avg_pixels(&res0, &max, dst_ptr);
+ }
+}
+
+static void vpx_highbd_filter_block1d16_h8_avg_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m256i signal[8], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+ __m256i ff[4];
+ pack_filters(filter, ff);
+
+ src_ptr -= 3;
+ do {
+ pack_16x1_pixels(src_ptr, signal);
+ filter_8x1_pixels(signal, ff, &res0);
+ filter_8x1_pixels(&signal[4], ff, &res1);
+ store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr);
+ height -= 1;
+ src_ptr += src_pitch;
+ dst_ptr += dst_pitch;
+ } while (height > 0);
+}
+
+static void vpx_highbd_filter_block1d8_v8_avg_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m256i signal[9], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+ __m256i ff[4];
+ pack_filters(filter, ff);
+
+ pack_8x9_init(src_ptr, src_pitch, signal);
+
+ do {
+ pack_8x9_pixels(src_ptr, src_pitch, signal);
+
+ filter_8x9_pixels(signal, ff, &res0, &res1);
+ store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
+ update_pixels(signal);
+
+ src_ptr += src_pitch << 1;
+ dst_ptr += dst_pitch << 1;
+ height -= 2;
+ } while (height > 0);
+}
+
+static void vpx_highbd_filter_block1d16_v8_avg_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m256i signal[17], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+ __m256i ff[4];
+ pack_filters(filter, ff);
+
+ pack_16x9_init(src_ptr, src_pitch, signal);
+
+ do {
+ pack_16x9_pixels(src_ptr, src_pitch, signal);
+ filter_16x9_pixels(signal, ff, &res0, &res1);
+ store_16x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
+ update_16x9_pixels(signal);
+
+ src_ptr += src_pitch << 1;
+ dst_ptr += dst_pitch << 1;
+ height -= 2;
+ } while (height > 0);
+}
+
+static void vpx_highbd_filter_block1d8_h2_avg_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m256i signal[2], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+ __m256i ff;
+ pack_2t_filter(filter, &ff);
+
+ src_ptr -= 3;
+ do {
+ pack_8x2_2t_pixels(src_ptr, src_pitch, signal);
+ filter_16_2t_pixels(signal, &ff, &res0, &res1);
+ store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
+ height -= 2;
+ src_ptr += src_pitch << 1;
+ dst_ptr += dst_pitch << 1;
+ } while (height > 1);
+
+ if (height > 0) {
+ pack_8x1_2t_pixels(src_ptr, signal);
+ filter_8x1_2t_pixels(signal, &ff, &res0);
+ store_8x1_avg_pixels(&res0, &max, dst_ptr);
+ }
+}
+
+static void vpx_highbd_filter_block1d16_h2_avg_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m256i signal[2], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+ __m256i ff;
+ pack_2t_filter(filter, &ff);
+
+ src_ptr -= 3;
+ do {
+ pack_16x1_2t_pixels(src_ptr, signal);
+ filter_16_2t_pixels(signal, &ff, &res0, &res1);
+ store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr);
+ height -= 1;
+ src_ptr += src_pitch;
+ dst_ptr += dst_pitch;
+ } while (height > 0);
+}
+
+static void vpx_highbd_filter_block1d16_v2_avg_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m256i signal[3], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+ __m256i ff;
+
+ pack_2t_filter(filter, &ff);
+ pack_16x2_init(src_ptr, signal);
+
+ do {
+ pack_16x2_2t_pixels(src_ptr, src_pitch, signal);
+ filter_16x2_2t_pixels(signal, &ff, &res0, &res1);
+ store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr);
+
+ src_ptr += src_pitch;
+ dst_ptr += dst_pitch;
+ height -= 1;
+ } while (height > 0);
+}
+
+static void vpx_highbd_filter_block1d8_v2_avg_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m128i signal[3], res0, res1;
+ const __m128i max = _mm_set1_epi16((1 << bd) - 1);
+ __m128i ff;
+
+ pack_8x1_2t_filter(filter, &ff);
+ pack_8x2_init(src_ptr, signal);
+
+ do {
+ pack_8x2_2t_pixels_ver(src_ptr, src_pitch, signal);
+ filter_8_2t_pixels(signal, &ff, &res0, &res1);
+ store_8x1_2t_avg_pixels_ver(&res0, &res1, &max, dst_ptr);
+
+ src_ptr += src_pitch;
+ dst_ptr += dst_pitch;
+ height -= 1;
+ } while (height > 0);
+}
+
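+// 4-pixel-wide blocks are too narrow to benefit from 256-bit registers, so
+// the AVX2 names below simply alias the existing SSE2 implementations.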
+void vpx_highbd_filter_block1d4_h8_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
+ ptrdiff_t, uint32_t, const int16_t *,
+ int);
+void vpx_highbd_filter_block1d4_h2_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
+ ptrdiff_t, uint32_t, const int16_t *,
+ int);
+void vpx_highbd_filter_block1d4_v8_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
+ ptrdiff_t, uint32_t, const int16_t *,
+ int);
+void vpx_highbd_filter_block1d4_v2_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
+ ptrdiff_t, uint32_t, const int16_t *,
+ int);
+#define vpx_highbd_filter_block1d4_h8_avx2 vpx_highbd_filter_block1d4_h8_sse2
+#define vpx_highbd_filter_block1d4_h2_avx2 vpx_highbd_filter_block1d4_h2_sse2
+#define vpx_highbd_filter_block1d4_v8_avx2 vpx_highbd_filter_block1d4_v8_sse2
+#define vpx_highbd_filter_block1d4_v2_avx2 vpx_highbd_filter_block1d4_v2_sse2
+
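+// These macros (defined in vpx_dsp/x86/convolve.h) expand into the public
+// vpx_highbd_convolve8_* entry points that dispatch to the block filter
+// functions defined above.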
+HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);
+HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);
+HIGH_FUN_CONV_2D(, avx2);
+
+void vpx_highbd_filter_block1d4_h8_avg_sse2(const uint16_t *, ptrdiff_t,
+ uint16_t *, ptrdiff_t, uint32_t,
+ const int16_t *, int);
+void vpx_highbd_filter_block1d4_h2_avg_sse2(const uint16_t *, ptrdiff_t,
+ uint16_t *, ptrdiff_t, uint32_t,
+ const int16_t *, int);
+void vpx_highbd_filter_block1d4_v8_avg_sse2(const uint16_t *, ptrdiff_t,
+ uint16_t *, ptrdiff_t, uint32_t,
+ const int16_t *, int);
+void vpx_highbd_filter_block1d4_v2_avg_sse2(const uint16_t *, ptrdiff_t,
+ uint16_t *, ptrdiff_t, uint32_t,
+ const int16_t *, int);
+#define vpx_highbd_filter_block1d4_h8_avg_avx2 \
+ vpx_highbd_filter_block1d4_h8_avg_sse2
+#define vpx_highbd_filter_block1d4_h2_avg_avx2 \
+ vpx_highbd_filter_block1d4_h2_avg_sse2
+#define vpx_highbd_filter_block1d4_v8_avg_avx2 \
+ vpx_highbd_filter_block1d4_v8_avg_sse2
+#define vpx_highbd_filter_block1d4_v2_avg_avx2 \
+ vpx_highbd_filter_block1d4_v2_avg_sse2
+
+HIGH_FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, avx2);
+HIGH_FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
+ avx2);
+HIGH_FUN_CONV_2D(avg_, avx2);
+
+#undef HIGHBD_FUNC
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse2.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse2.c
new file mode 100644
index 00000000000..f16e4d07186
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse2.c
@@ -0,0 +1,244 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
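+// Strategy: try the fast 16-bit SSE2 transform first. The input is packed
+// to 16 bits and compared against a precomputed bound (+/-3155) under which
+// the 16-bit arithmetic cannot overflow; if any coefficient is out of range
+// at either pass, the code falls back to the high-precision C transform.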
+void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ tran_low_t out[16 * 16];
+ tran_low_t *outptr = out;
+ int i, j, test;
+ __m128i inptr[32];
+ __m128i min_input, max_input, temp1, temp2, sign_bits;
+ const __m128i zero = _mm_set1_epi16(0);
+ const __m128i rounding = _mm_set1_epi16(32);
+ const __m128i max = _mm_set1_epi16(3155);
+ const __m128i min = _mm_set1_epi16(-3155);
+ int optimised_cols = 0;
+
+ // Load input into __m128i & pack to 16 bits
+ for (i = 0; i < 16; i++) {
+ temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
+ temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
+ inptr[i] = _mm_packs_epi32(temp1, temp2);
+ temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
+ temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
+ inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
+ }
+
+ // Find the min & max for the row transform
+ max_input = _mm_max_epi16(inptr[0], inptr[1]);
+ min_input = _mm_min_epi16(inptr[0], inptr[1]);
+ for (i = 2; i < 32; i++) {
+ max_input = _mm_max_epi16(max_input, inptr[i]);
+ min_input = _mm_min_epi16(min_input, inptr[i]);
+ }
+ max_input = _mm_cmpgt_epi16(max_input, max);
+ min_input = _mm_cmplt_epi16(min_input, min);
+ temp1 = _mm_or_si128(max_input, min_input);
+ test = _mm_movemask_epi8(temp1);
+
+ if (!test) {
+ // Do the row transform
+ idct16_sse2(inptr, inptr + 16);
+
+ // Find the min & max for the column transform
+ max_input = _mm_max_epi16(inptr[0], inptr[1]);
+ min_input = _mm_min_epi16(inptr[0], inptr[1]);
+ for (i = 2; i < 32; i++) {
+ max_input = _mm_max_epi16(max_input, inptr[i]);
+ min_input = _mm_min_epi16(min_input, inptr[i]);
+ }
+ max_input = _mm_cmpgt_epi16(max_input, max);
+ min_input = _mm_cmplt_epi16(min_input, min);
+ temp1 = _mm_or_si128(max_input, min_input);
+ test = _mm_movemask_epi8(temp1);
+
+ if (test) {
+ array_transpose_16x16(inptr, inptr + 16);
+ for (i = 0; i < 16; i++) {
+ sign_bits = _mm_cmplt_epi16(inptr[i], zero);
+ temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
+ temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
+ sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
+ temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
+ temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
+ }
+ } else {
+ // Set to use the optimised transform for the column
+ optimised_cols = 1;
+ }
+ } else {
+ // Run the un-optimised row transform
+ for (i = 0; i < 16; ++i) {
+ vpx_highbd_idct16_c(input, outptr, bd);
+ input += 16;
+ outptr += 16;
+ }
+ }
+
+ if (optimised_cols) {
+ idct16_sse2(inptr, inptr + 16);
+
+    // Final round & shift, then reconstruction and store
+ {
+ __m128i d[2];
+ for (i = 0; i < 16; i++) {
+ inptr[i] = _mm_add_epi16(inptr[i], rounding);
+ inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding);
+ d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
+ d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8));
+ inptr[i] = _mm_srai_epi16(inptr[i], 6);
+ inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6);
+ d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd);
+ d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd);
+ // Store
+ _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]);
+ _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]);
+ }
+ }
+ } else {
+ // Run the un-optimised column transform
+ tran_low_t temp_in[16], temp_out[16];
+ for (i = 0; i < 16; ++i) {
+ for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
+ vpx_highbd_idct16_c(temp_in, temp_out, bd);
+ for (j = 0; j < 16; ++j) {
+ dest[j * stride + i] = highbd_clip_pixel_add(
+ dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+ }
+ }
+ }
+}
+
+void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ tran_low_t out[16 * 16] = { 0 };
+ tran_low_t *outptr = out;
+ int i, j, test;
+ __m128i inptr[32];
+ __m128i min_input, max_input, temp1, temp2, sign_bits;
+ const __m128i zero = _mm_set1_epi16(0);
+ const __m128i rounding = _mm_set1_epi16(32);
+ const __m128i max = _mm_set1_epi16(3155);
+ const __m128i min = _mm_set1_epi16(-3155);
+ int optimised_cols = 0;
+
+ // Load input into __m128i & pack to 16 bits
+ for (i = 0; i < 16; i++) {
+ temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
+ temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
+ inptr[i] = _mm_packs_epi32(temp1, temp2);
+ temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
+ temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
+ inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
+ }
+
+ // Find the min & max for the row transform
+  // Since all non-zero DCT coefficients are in the upper-left 4x4 area,
+  // we only need to consider the first 4 rows here.
+ max_input = _mm_max_epi16(inptr[0], inptr[1]);
+ min_input = _mm_min_epi16(inptr[0], inptr[1]);
+ for (i = 2; i < 4; i++) {
+ max_input = _mm_max_epi16(max_input, inptr[i]);
+ min_input = _mm_min_epi16(min_input, inptr[i]);
+ }
+ max_input = _mm_cmpgt_epi16(max_input, max);
+ min_input = _mm_cmplt_epi16(min_input, min);
+ temp1 = _mm_or_si128(max_input, min_input);
+ test = _mm_movemask_epi8(temp1);
+
+ if (!test) {
+ // Do the row transform (N.B. This transposes inptr)
+ idct16_sse2(inptr, inptr + 16);
+
+ // Find the min & max for the column transform
+ // N.B. Only first 4 cols contain non-zero coeffs
+ max_input = _mm_max_epi16(inptr[0], inptr[1]);
+ min_input = _mm_min_epi16(inptr[0], inptr[1]);
+ for (i = 2; i < 16; i++) {
+ max_input = _mm_max_epi16(max_input, inptr[i]);
+ min_input = _mm_min_epi16(min_input, inptr[i]);
+ }
+ max_input = _mm_cmpgt_epi16(max_input, max);
+ min_input = _mm_cmplt_epi16(min_input, min);
+ temp1 = _mm_or_si128(max_input, min_input);
+ test = _mm_movemask_epi8(temp1);
+
+ if (test) {
+      // Use the fact that only the first 4 rows contain non-zero coeffs
+ array_transpose_8x8(inptr, inptr);
+ array_transpose_8x8(inptr + 8, inptr + 16);
+ for (i = 0; i < 4; i++) {
+ sign_bits = _mm_cmplt_epi16(inptr[i], zero);
+ temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
+ temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
+ sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
+ temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
+ temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
+ }
+ } else {
+ // Set to use the optimised transform for the column
+ optimised_cols = 1;
+ }
+ } else {
+ // Run the un-optimised row transform
+ for (i = 0; i < 4; ++i) {
+ vpx_highbd_idct16_c(input, outptr, bd);
+ input += 16;
+ outptr += 16;
+ }
+ }
+
+ if (optimised_cols) {
+ idct16_sse2(inptr, inptr + 16);
+
+    // Final round & shift, then reconstruction and store
+ {
+ __m128i d[2];
+ for (i = 0; i < 16; i++) {
+ inptr[i] = _mm_add_epi16(inptr[i], rounding);
+ inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding);
+ d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
+ d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8));
+ inptr[i] = _mm_srai_epi16(inptr[i], 6);
+ inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6);
+ d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd);
+ d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd);
+ // Store
+ _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]);
+ _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]);
+ }
+ }
+ } else {
+ // Run the un-optimised column transform
+ tran_low_t temp_in[16], temp_out[16];
+ for (i = 0; i < 16; ++i) {
+ for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
+ vpx_highbd_idct16_c(temp_in, temp_out, bd);
+ for (j = 0; j < 16; ++j) {
+ dest[j * stride + i] = highbd_clip_pixel_add(
+ dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+ }
+ }
+ }
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse2.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse2.c
new file mode 100644
index 00000000000..bc9debf319c
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse2.c
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
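+// DC-only inverse transform: the single DC coefficient passes through both
+// 1-D passes as two multiplications by cospi_16_64, is rounded down to
+// pixel precision, then broadcast and added to every destination pixel with
+// a saturating add clamped to [0, (1 << bd) - 1].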
+void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ __m128i dc_value, d;
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
+ int a, i, j;
+ tran_low_t out;
+
+ out = HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
+ out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
+ a = ROUND_POWER_OF_TWO(out, 6);
+
+ d = _mm_set1_epi32(a);
+ dc_value = _mm_packs_epi32(d, d);
+ for (i = 0; i < 32; ++i) {
+ for (j = 0; j < 4; ++j) {
+ d = _mm_loadu_si128((const __m128i *)(&dest[j * 8]));
+ d = _mm_adds_epi16(d, dc_value);
+ d = _mm_max_epi16(d, zero);
+ d = _mm_min_epi16(d, max);
+ _mm_storeu_si128((__m128i *)(&dest[j * 8]), d);
+ }
+ dest += stride;
+ }
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse2.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse2.c
new file mode 100644
index 00000000000..3949ce92f89
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse2.c
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
+void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ tran_low_t out[4 * 4];
+ tran_low_t *outptr = out;
+ int i, j;
+ __m128i inptr[4];
+ __m128i sign_bits[2];
+ __m128i temp_mm, min_input, max_input;
+ int test;
+ int optimised_cols = 0;
+ const __m128i zero = _mm_set1_epi16(0);
+ const __m128i eight = _mm_set1_epi16(8);
+ const __m128i max = _mm_set1_epi16(12043);
+ const __m128i min = _mm_set1_epi16(-12043);
+ // Load input into __m128i
+ inptr[0] = _mm_loadu_si128((const __m128i *)input);
+ inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4));
+ inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8));
+ inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12));
+
+ // Pack to 16 bits
+ inptr[0] = _mm_packs_epi32(inptr[0], inptr[1]);
+ inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]);
+
+ max_input = _mm_max_epi16(inptr[0], inptr[1]);
+ min_input = _mm_min_epi16(inptr[0], inptr[1]);
+ max_input = _mm_cmpgt_epi16(max_input, max);
+ min_input = _mm_cmplt_epi16(min_input, min);
+ temp_mm = _mm_or_si128(max_input, min_input);
+ test = _mm_movemask_epi8(temp_mm);
+
+ if (!test) {
+ // Do the row transform
+ idct4_sse2(inptr);
+
+ // Check the min & max values
+ max_input = _mm_max_epi16(inptr[0], inptr[1]);
+ min_input = _mm_min_epi16(inptr[0], inptr[1]);
+ max_input = _mm_cmpgt_epi16(max_input, max);
+ min_input = _mm_cmplt_epi16(min_input, min);
+ temp_mm = _mm_or_si128(max_input, min_input);
+ test = _mm_movemask_epi8(temp_mm);
+
+ if (test) {
+ transpose_16bit_4x4(inptr);
+ sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero);
+ sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero);
+ inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]);
+ inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]);
+ inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]);
+ inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]);
+ _mm_storeu_si128((__m128i *)outptr, inptr[0]);
+ _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]);
+ _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]);
+ _mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]);
+ } else {
+ // Set to use the optimised transform for the column
+ optimised_cols = 1;
+ }
+ } else {
+ // Run the un-optimised row transform
+ for (i = 0; i < 4; ++i) {
+ vpx_highbd_idct4_c(input, outptr, bd);
+ input += 4;
+ outptr += 4;
+ }
+ }
+
+ if (optimised_cols) {
+ idct4_sse2(inptr);
+
+ // Final round and shift
+ inptr[0] = _mm_add_epi16(inptr[0], eight);
+ inptr[1] = _mm_add_epi16(inptr[1], eight);
+
+ inptr[0] = _mm_srai_epi16(inptr[0], 4);
+ inptr[1] = _mm_srai_epi16(inptr[1], 4);
+
+ // Reconstruction and Store
+ {
+ __m128i d0 = _mm_loadl_epi64((const __m128i *)dest);
+ __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2));
+ d0 = _mm_unpacklo_epi64(
+ d0, _mm_loadl_epi64((const __m128i *)(dest + stride)));
+ d2 = _mm_unpacklo_epi64(
+ d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3)));
+ d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd);
+ d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd);
+ // store input0
+ _mm_storel_epi64((__m128i *)dest, d0);
+ // store input1
+ d0 = _mm_srli_si128(d0, 8);
+ _mm_storel_epi64((__m128i *)(dest + stride), d0);
+ // store input2
+ _mm_storel_epi64((__m128i *)(dest + stride * 2), d2);
+ // store input3
+ d2 = _mm_srli_si128(d2, 8);
+ _mm_storel_epi64((__m128i *)(dest + stride * 3), d2);
+ }
+ } else {
+ // Run the un-optimised column transform
+ tran_low_t temp_in[4], temp_out[4];
+ // Columns
+ for (i = 0; i < 4; ++i) {
+ for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
+ vpx_highbd_idct4_c(temp_in, temp_out, bd);
+ for (j = 0; j < 4; ++j) {
+ dest[j * stride + i] = highbd_clip_pixel_add(
+ dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
+ }
+ }
+ }
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse2.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse2.c
new file mode 100644
index 00000000000..6a2e180646c
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse2.c
@@ -0,0 +1,216 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
+void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ tran_low_t out[8 * 8];
+ tran_low_t *outptr = out;
+ int i, j, test;
+ __m128i inptr[8];
+ __m128i min_input, max_input, temp1, temp2, sign_bits;
+ const __m128i zero = _mm_set1_epi16(0);
+ const __m128i sixteen = _mm_set1_epi16(16);
+ const __m128i max = _mm_set1_epi16(6201);
+ const __m128i min = _mm_set1_epi16(-6201);
+ int optimised_cols = 0;
+
+ // Load input into __m128i & pack to 16 bits
+ for (i = 0; i < 8; i++) {
+ temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
+ temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
+ inptr[i] = _mm_packs_epi32(temp1, temp2);
+ }
+
+ // Find the min & max for the row transform
+ max_input = _mm_max_epi16(inptr[0], inptr[1]);
+ min_input = _mm_min_epi16(inptr[0], inptr[1]);
+ for (i = 2; i < 8; i++) {
+ max_input = _mm_max_epi16(max_input, inptr[i]);
+ min_input = _mm_min_epi16(min_input, inptr[i]);
+ }
+ max_input = _mm_cmpgt_epi16(max_input, max);
+ min_input = _mm_cmplt_epi16(min_input, min);
+ temp1 = _mm_or_si128(max_input, min_input);
+ test = _mm_movemask_epi8(temp1);
+
+ if (!test) {
+ // Do the row transform
+ idct8_sse2(inptr);
+
+ // Find the min & max for the column transform
+ max_input = _mm_max_epi16(inptr[0], inptr[1]);
+ min_input = _mm_min_epi16(inptr[0], inptr[1]);
+ for (i = 2; i < 8; i++) {
+ max_input = _mm_max_epi16(max_input, inptr[i]);
+ min_input = _mm_min_epi16(min_input, inptr[i]);
+ }
+ max_input = _mm_cmpgt_epi16(max_input, max);
+ min_input = _mm_cmplt_epi16(min_input, min);
+ temp1 = _mm_or_si128(max_input, min_input);
+ test = _mm_movemask_epi8(temp1);
+
+ if (test) {
+ array_transpose_8x8(inptr, inptr);
+ for (i = 0; i < 8; i++) {
+ sign_bits = _mm_cmplt_epi16(inptr[i], zero);
+ temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
+ temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
+ }
+ } else {
+ // Set to use the optimised transform for the column
+ optimised_cols = 1;
+ }
+ } else {
+ // Run the un-optimised row transform
+ for (i = 0; i < 8; ++i) {
+ vpx_highbd_idct8_c(input, outptr, bd);
+ input += 8;
+ outptr += 8;
+ }
+ }
+
+ if (optimised_cols) {
+ idct8_sse2(inptr);
+
+    // Final round & shift, then reconstruction and store
+ {
+ __m128i d[8];
+ for (i = 0; i < 8; i++) {
+ inptr[i] = _mm_add_epi16(inptr[i], sixteen);
+ d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
+ inptr[i] = _mm_srai_epi16(inptr[i], 5);
+ d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
+ // Store
+ _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]);
+ }
+ }
+ } else {
+ // Run the un-optimised column transform
+ tran_low_t temp_in[8], temp_out[8];
+ for (i = 0; i < 8; ++i) {
+ for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
+ vpx_highbd_idct8_c(temp_in, temp_out, bd);
+ for (j = 0; j < 8; ++j) {
+ dest[j * stride + i] = highbd_clip_pixel_add(
+ dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
+ }
+ }
+ }
+}
+
+void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ tran_low_t out[8 * 8] = { 0 };
+ tran_low_t *outptr = out;
+ int i, j, test;
+ __m128i inptr[8];
+ __m128i min_input, max_input, temp1, temp2, sign_bits;
+ const __m128i zero = _mm_set1_epi16(0);
+ const __m128i sixteen = _mm_set1_epi16(16);
+ const __m128i max = _mm_set1_epi16(6201);
+ const __m128i min = _mm_set1_epi16(-6201);
+ int optimised_cols = 0;
+
+ // Load input into __m128i & pack to 16 bits
+ for (i = 0; i < 8; i++) {
+ temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
+ temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
+ inptr[i] = _mm_packs_epi32(temp1, temp2);
+ }
+
+ // Find the min & max for the row transform
+  // Only the first 4 rows have non-zero coefs
+ max_input = _mm_max_epi16(inptr[0], inptr[1]);
+ min_input = _mm_min_epi16(inptr[0], inptr[1]);
+ for (i = 2; i < 4; i++) {
+ max_input = _mm_max_epi16(max_input, inptr[i]);
+ min_input = _mm_min_epi16(min_input, inptr[i]);
+ }
+ max_input = _mm_cmpgt_epi16(max_input, max);
+ min_input = _mm_cmplt_epi16(min_input, min);
+ temp1 = _mm_or_si128(max_input, min_input);
+ test = _mm_movemask_epi8(temp1);
+
+ if (!test) {
+ // Do the row transform
+ idct8_sse2(inptr);
+
+ // Find the min & max for the column transform
+ // N.B. Only first 4 cols contain non-zero coeffs
+ max_input = _mm_max_epi16(inptr[0], inptr[1]);
+ min_input = _mm_min_epi16(inptr[0], inptr[1]);
+ for (i = 2; i < 8; i++) {
+ max_input = _mm_max_epi16(max_input, inptr[i]);
+ min_input = _mm_min_epi16(min_input, inptr[i]);
+ }
+ max_input = _mm_cmpgt_epi16(max_input, max);
+ min_input = _mm_cmplt_epi16(min_input, min);
+ temp1 = _mm_or_si128(max_input, min_input);
+ test = _mm_movemask_epi8(temp1);
+
+ if (test) {
+      // Use the fact that only the first 4 rows contain non-zero coeffs
+ array_transpose_4X8(inptr, inptr);
+ for (i = 0; i < 4; i++) {
+ sign_bits = _mm_cmplt_epi16(inptr[i], zero);
+ temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
+ temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
+ }
+ } else {
+ // Set to use the optimised transform for the column
+ optimised_cols = 1;
+ }
+ } else {
+ // Run the un-optimised row transform
+ for (i = 0; i < 4; ++i) {
+ vpx_highbd_idct8_c(input, outptr, bd);
+ input += 8;
+ outptr += 8;
+ }
+ }
+
+ if (optimised_cols) {
+ idct8_sse2(inptr);
+
+    // Final round & shift, then reconstruction and store
+ {
+ __m128i d[8];
+ for (i = 0; i < 8; i++) {
+ inptr[i] = _mm_add_epi16(inptr[i], sixteen);
+ d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
+ inptr[i] = _mm_srai_epi16(inptr[i], 5);
+ d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
+ // Store
+ _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]);
+ }
+ }
+ } else {
+ // Run the un-optimised column transform
+ tran_low_t temp_in[8], temp_out[8];
+ for (i = 0; i < 8; ++i) {
+ for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
+ vpx_highbd_idct8_c(temp_in, temp_out, bd);
+ for (j = 0; j < 8; ++j) {
+ dest[j * stride + i] = highbd_clip_pixel_add(
+ dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
+ }
+ }
+ }
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h
new file mode 100644
index 00000000000..774cce1d40c
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_
+#define VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_
+
+#include <emmintrin.h> // SSE2
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/inv_txfm.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
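+// Branchless clamp to the legal pixel range [0, (1 << bd) - 1]: a cmpgt
+// mask selects `max` wherever the value exceeds it, and the final
+// cmpgt-with-zero mask zeroes out negative results (SSE2 has no
+// variable-bound unsigned 16-bit min).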
+static INLINE __m128i clamp_high_sse2(__m128i value, int bd) {
+ __m128i ubounded, retval;
+ const __m128i zero = _mm_set1_epi16(0);
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
+ ubounded = _mm_cmpgt_epi16(value, max);
+ retval = _mm_andnot_si128(ubounded, value);
+ ubounded = _mm_and_si128(ubounded, max);
+ retval = _mm_or_si128(retval, ubounded);
+ retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero));
+ return retval;
+}
+
+#endif // VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_sse2.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_sse2.c
index 8c33caedbd8..f75dab07aed 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_sse2.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_sse2.c
@@ -10,153 +10,36 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"
-#define RECON_AND_STORE4X4(dest, in_x) \
- { \
- __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \
- d0 = _mm_unpacklo_epi8(d0, zero); \
- d0 = _mm_add_epi16(in_x, d0); \
- d0 = _mm_packus_epi16(d0, d0); \
- *(int *)(dest) = _mm_cvtsi128_si32(d0); \
- }
-
void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest,
int stride) {
- const __m128i zero = _mm_setzero_si128();
const __m128i eight = _mm_set1_epi16(8);
- const __m128i cst = _mm_setr_epi16(
- (int16_t)cospi_16_64, (int16_t)cospi_16_64, (int16_t)cospi_16_64,
- (int16_t)-cospi_16_64, (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
- (int16_t)cospi_8_64, (int16_t)cospi_24_64);
- const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
- __m128i input0, input1, input2, input3;
+ __m128i in[2];
// Rows
- input0 = load_input_data(input);
- input2 = load_input_data(input + 8);
-
- // Construct i3, i1, i3, i1, i2, i0, i2, i0
- input0 = _mm_shufflelo_epi16(input0, 0xd8);
- input0 = _mm_shufflehi_epi16(input0, 0xd8);
- input2 = _mm_shufflelo_epi16(input2, 0xd8);
- input2 = _mm_shufflehi_epi16(input2, 0xd8);
-
- input1 = _mm_unpackhi_epi32(input0, input0);
- input0 = _mm_unpacklo_epi32(input0, input0);
- input3 = _mm_unpackhi_epi32(input2, input2);
- input2 = _mm_unpacklo_epi32(input2, input2);
-
- // Stage 1
- input0 = _mm_madd_epi16(input0, cst);
- input1 = _mm_madd_epi16(input1, cst);
- input2 = _mm_madd_epi16(input2, cst);
- input3 = _mm_madd_epi16(input3, cst);
-
- input0 = _mm_add_epi32(input0, rounding);
- input1 = _mm_add_epi32(input1, rounding);
- input2 = _mm_add_epi32(input2, rounding);
- input3 = _mm_add_epi32(input3, rounding);
-
- input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
- input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
- input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
- input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
-
- // Stage 2
- input0 = _mm_packs_epi32(input0, input1);
- input1 = _mm_packs_epi32(input2, input3);
-
- // Transpose
- input2 = _mm_unpacklo_epi16(input0, input1);
- input3 = _mm_unpackhi_epi16(input0, input1);
- input0 = _mm_unpacklo_epi32(input2, input3);
- input1 = _mm_unpackhi_epi32(input2, input3);
-
- // Switch column2, column 3, and then, we got:
- // input2: column1, column 0; input3: column2, column 3.
- input1 = _mm_shuffle_epi32(input1, 0x4e);
- input2 = _mm_add_epi16(input0, input1);
- input3 = _mm_sub_epi16(input0, input1);
+ in[0] = load_input_data(input);
+ in[1] = load_input_data(input + 8);
+ idct4_sse2(in);
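+  // idct4_sse2() transposes its input first (see below), so calling it for
+  // the rows and again for the columns completes the full 2-D transform.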
// Columns
- // Construct i3, i1, i3, i1, i2, i0, i2, i0
- input0 = _mm_unpacklo_epi32(input2, input2);
- input1 = _mm_unpackhi_epi32(input2, input2);
- input2 = _mm_unpackhi_epi32(input3, input3);
- input3 = _mm_unpacklo_epi32(input3, input3);
-
- // Stage 1
- input0 = _mm_madd_epi16(input0, cst);
- input1 = _mm_madd_epi16(input1, cst);
- input2 = _mm_madd_epi16(input2, cst);
- input3 = _mm_madd_epi16(input3, cst);
-
- input0 = _mm_add_epi32(input0, rounding);
- input1 = _mm_add_epi32(input1, rounding);
- input2 = _mm_add_epi32(input2, rounding);
- input3 = _mm_add_epi32(input3, rounding);
-
- input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
- input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
- input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
- input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
-
- // Stage 2
- input0 = _mm_packs_epi32(input0, input2);
- input1 = _mm_packs_epi32(input1, input3);
-
- // Transpose
- input2 = _mm_unpacklo_epi16(input0, input1);
- input3 = _mm_unpackhi_epi16(input0, input1);
- input0 = _mm_unpacklo_epi32(input2, input3);
- input1 = _mm_unpackhi_epi32(input2, input3);
-
- // Switch column2, column 3, and then, we got:
- // input2: column1, column 0; input3: column2, column 3.
- input1 = _mm_shuffle_epi32(input1, 0x4e);
- input2 = _mm_add_epi16(input0, input1);
- input3 = _mm_sub_epi16(input0, input1);
+ idct4_sse2(in);
// Final round and shift
- input2 = _mm_add_epi16(input2, eight);
- input3 = _mm_add_epi16(input3, eight);
-
- input2 = _mm_srai_epi16(input2, 4);
- input3 = _mm_srai_epi16(input3, 4);
+ in[0] = _mm_add_epi16(in[0], eight);
+ in[1] = _mm_add_epi16(in[1], eight);
+ in[0] = _mm_srai_epi16(in[0], 4);
+ in[1] = _mm_srai_epi16(in[1], 4);
- // Reconstruction and Store
- {
- __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
- __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
- d0 = _mm_unpacklo_epi32(d0,
- _mm_cvtsi32_si128(*(const int *)(dest + stride)));
- d2 = _mm_unpacklo_epi32(
- _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)), d2);
- d0 = _mm_unpacklo_epi8(d0, zero);
- d2 = _mm_unpacklo_epi8(d2, zero);
- d0 = _mm_add_epi16(d0, input2);
- d2 = _mm_add_epi16(d2, input3);
- d0 = _mm_packus_epi16(d0, d2);
- // store input0
- *(int *)dest = _mm_cvtsi128_si32(d0);
- // store input1
- d0 = _mm_srli_si128(d0, 4);
- *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
- // store input2
- d0 = _mm_srli_si128(d0, 4);
- *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
- // store input3
- d0 = _mm_srli_si128(d0, 4);
- *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
- }
+ recon_and_store4x4_sse2(in, dest, stride);
}
void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest,
int stride) {
- __m128i dc_value;
const __m128i zero = _mm_setzero_si128();
int a;
+ __m128i dc_value, d[2];
a = (int)dct_const_round_shift(input[0] * cospi_16_64);
a = (int)dct_const_round_shift(a * cospi_16_64);
@@ -164,18 +47,26 @@ void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest,
dc_value = _mm_set1_epi16(a);
- RECON_AND_STORE4X4(dest + 0 * stride, dc_value);
- RECON_AND_STORE4X4(dest + 1 * stride, dc_value);
- RECON_AND_STORE4X4(dest + 2 * stride, dc_value);
- RECON_AND_STORE4X4(dest + 3 * stride, dc_value);
-}
-
-static INLINE void transpose_4x4(__m128i *res) {
- const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
- const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
-
- res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1);
- res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1);
+ // Reconstruction and Store
+ d[0] = _mm_cvtsi32_si128(*(const int *)(dest));
+ d[1] = _mm_cvtsi32_si128(*(const int *)(dest + stride * 3));
+ d[0] = _mm_unpacklo_epi32(d[0],
+ _mm_cvtsi32_si128(*(const int *)(dest + stride)));
+ d[1] = _mm_unpacklo_epi32(
+ _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)), d[1]);
+ d[0] = _mm_unpacklo_epi8(d[0], zero);
+ d[1] = _mm_unpacklo_epi8(d[1], zero);
+ d[0] = _mm_add_epi16(d[0], dc_value);
+ d[1] = _mm_add_epi16(d[1], dc_value);
+ d[0] = _mm_packus_epi16(d[0], d[1]);
+
+ *(int *)dest = _mm_cvtsi128_si32(d[0]);
+ d[0] = _mm_srli_si128(d[0], 4);
+ *(int *)(dest + stride) = _mm_cvtsi128_si32(d[0]);
+ d[0] = _mm_srli_si128(d[0], 4);
+ *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d[0]);
+ d[0] = _mm_srli_si128(d[0], 4);
+ *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d[0]);
}
void idct4_sse2(__m128i *in) {
@@ -186,7 +77,7 @@ void idct4_sse2(__m128i *in) {
const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
__m128i u[8], v[8];
- transpose_4x4(in);
+ transpose_16bit_4x4(in);
// stage 1
u[0] = _mm_unpacklo_epi16(in[0], in[1]);
u[1] = _mm_unpackhi_epi16(in[0], in[1]);
@@ -224,7 +115,7 @@ void iadst4_sse2(__m128i *in) {
const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
__m128i u[8], v[8], in7;
- transpose_4x4(in);
+ transpose_16bit_4x4(in);
in7 = _mm_srli_si128(in[1], 8);
in7 = _mm_add_epi16(in7, in[0]);
in7 = _mm_sub_epi16(in7, in[1]);
@@ -3349,595 +3240,3 @@ void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest,
RECON_AND_STORE(dest + 24 + j * stride, dc_value);
}
}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-static INLINE __m128i clamp_high_sse2(__m128i value, int bd) {
- __m128i ubounded, retval;
- const __m128i zero = _mm_set1_epi16(0);
- const __m128i one = _mm_set1_epi16(1);
- const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
- ubounded = _mm_cmpgt_epi16(value, max);
- retval = _mm_andnot_si128(ubounded, value);
- ubounded = _mm_and_si128(ubounded, max);
- retval = _mm_or_si128(retval, ubounded);
- retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero));
- return retval;
-}
-
-void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- tran_low_t out[4 * 4];
- tran_low_t *outptr = out;
- int i, j;
- __m128i inptr[4];
- __m128i sign_bits[2];
- __m128i temp_mm, min_input, max_input;
- int test;
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- int optimised_cols = 0;
- const __m128i zero = _mm_set1_epi16(0);
- const __m128i eight = _mm_set1_epi16(8);
- const __m128i max = _mm_set1_epi16(12043);
- const __m128i min = _mm_set1_epi16(-12043);
- // Load input into __m128i
- inptr[0] = _mm_loadu_si128((const __m128i *)input);
- inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4));
- inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8));
- inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12));
-
- // Pack to 16 bits
- inptr[0] = _mm_packs_epi32(inptr[0], inptr[1]);
- inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]);
-
- max_input = _mm_max_epi16(inptr[0], inptr[1]);
- min_input = _mm_min_epi16(inptr[0], inptr[1]);
- max_input = _mm_cmpgt_epi16(max_input, max);
- min_input = _mm_cmplt_epi16(min_input, min);
- temp_mm = _mm_or_si128(max_input, min_input);
- test = _mm_movemask_epi8(temp_mm);
-
- if (!test) {
- // Do the row transform
- idct4_sse2(inptr);
-
- // Check the min & max values
- max_input = _mm_max_epi16(inptr[0], inptr[1]);
- min_input = _mm_min_epi16(inptr[0], inptr[1]);
- max_input = _mm_cmpgt_epi16(max_input, max);
- min_input = _mm_cmplt_epi16(min_input, min);
- temp_mm = _mm_or_si128(max_input, min_input);
- test = _mm_movemask_epi8(temp_mm);
-
- if (test) {
- transpose_4x4(inptr);
- sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero);
- sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero);
- inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]);
- inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]);
- inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]);
- inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]);
- _mm_storeu_si128((__m128i *)outptr, inptr[0]);
- _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]);
- _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]);
- _mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]);
- } else {
- // Set to use the optimised transform for the column
- optimised_cols = 1;
- }
- } else {
- // Run the un-optimised row transform
- for (i = 0; i < 4; ++i) {
- vpx_highbd_idct4_c(input, outptr, bd);
- input += 4;
- outptr += 4;
- }
- }
-
- if (optimised_cols) {
- idct4_sse2(inptr);
-
- // Final round and shift
- inptr[0] = _mm_add_epi16(inptr[0], eight);
- inptr[1] = _mm_add_epi16(inptr[1], eight);
-
- inptr[0] = _mm_srai_epi16(inptr[0], 4);
- inptr[1] = _mm_srai_epi16(inptr[1], 4);
-
- // Reconstruction and Store
- {
- __m128i d0 = _mm_loadl_epi64((const __m128i *)dest);
- __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2));
- d0 = _mm_unpacklo_epi64(
- d0, _mm_loadl_epi64((const __m128i *)(dest + stride)));
- d2 = _mm_unpacklo_epi64(
- d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3)));
- d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd);
- d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd);
- // store input0
- _mm_storel_epi64((__m128i *)dest, d0);
- // store input1
- d0 = _mm_srli_si128(d0, 8);
- _mm_storel_epi64((__m128i *)(dest + stride), d0);
- // store input2
- _mm_storel_epi64((__m128i *)(dest + stride * 2), d2);
- // store input3
- d2 = _mm_srli_si128(d2, 8);
- _mm_storel_epi64((__m128i *)(dest + stride * 3), d2);
- }
- } else {
- // Run the un-optimised column transform
- tran_low_t temp_in[4], temp_out[4];
- // Columns
- for (i = 0; i < 4; ++i) {
- for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
- vpx_highbd_idct4_c(temp_in, temp_out, bd);
- for (j = 0; j < 4; ++j) {
- dest[j * stride + i] = highbd_clip_pixel_add(
- dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
- }
- }
- }
-}
-
-void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- tran_low_t out[8 * 8];
- tran_low_t *outptr = out;
- int i, j, test;
- __m128i inptr[8];
- __m128i min_input, max_input, temp1, temp2, sign_bits;
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- const __m128i zero = _mm_set1_epi16(0);
- const __m128i sixteen = _mm_set1_epi16(16);
- const __m128i max = _mm_set1_epi16(6201);
- const __m128i min = _mm_set1_epi16(-6201);
- int optimised_cols = 0;
-
- // Load input into __m128i & pack to 16 bits
- for (i = 0; i < 8; i++) {
- temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
- temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
- inptr[i] = _mm_packs_epi32(temp1, temp2);
- }
-
- // Find the min & max for the row transform
- max_input = _mm_max_epi16(inptr[0], inptr[1]);
- min_input = _mm_min_epi16(inptr[0], inptr[1]);
- for (i = 2; i < 8; i++) {
- max_input = _mm_max_epi16(max_input, inptr[i]);
- min_input = _mm_min_epi16(min_input, inptr[i]);
- }
- max_input = _mm_cmpgt_epi16(max_input, max);
- min_input = _mm_cmplt_epi16(min_input, min);
- temp1 = _mm_or_si128(max_input, min_input);
- test = _mm_movemask_epi8(temp1);
-
- if (!test) {
- // Do the row transform
- idct8_sse2(inptr);
-
- // Find the min & max for the column transform
- max_input = _mm_max_epi16(inptr[0], inptr[1]);
- min_input = _mm_min_epi16(inptr[0], inptr[1]);
- for (i = 2; i < 8; i++) {
- max_input = _mm_max_epi16(max_input, inptr[i]);
- min_input = _mm_min_epi16(min_input, inptr[i]);
- }
- max_input = _mm_cmpgt_epi16(max_input, max);
- min_input = _mm_cmplt_epi16(min_input, min);
- temp1 = _mm_or_si128(max_input, min_input);
- test = _mm_movemask_epi8(temp1);
-
- if (test) {
- array_transpose_8x8(inptr, inptr);
- for (i = 0; i < 8; i++) {
- sign_bits = _mm_cmplt_epi16(inptr[i], zero);
- temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
- temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
- }
- } else {
- // Set to use the optimised transform for the column
- optimised_cols = 1;
- }
- } else {
- // Run the un-optimised row transform
- for (i = 0; i < 8; ++i) {
- vpx_highbd_idct8_c(input, outptr, bd);
- input += 8;
- outptr += 8;
- }
- }
-
- if (optimised_cols) {
- idct8_sse2(inptr);
-
-    // Final rounding and shift, then reconstruct and store
- {
- __m128i d[8];
- for (i = 0; i < 8; i++) {
- inptr[i] = _mm_add_epi16(inptr[i], sixteen);
- d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
- inptr[i] = _mm_srai_epi16(inptr[i], 5);
- d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
- // Store
- _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]);
- }
- }
- } else {
- // Run the un-optimised column transform
- tran_low_t temp_in[8], temp_out[8];
- for (i = 0; i < 8; ++i) {
- for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
- vpx_highbd_idct8_c(temp_in, temp_out, bd);
- for (j = 0; j < 8; ++j) {
- dest[j * stride + i] = highbd_clip_pixel_add(
- dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
- }
- }
- }
-}
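The final stage above is the vector form of a simple scalar recipe: bias by 16, arithmetic shift right by 5 (the 8x8 inverse-transform scaling), add to the prediction, and clamp to the valid range for bit depth bd. A sketch with a hypothetical helper name:

static uint16_t highbd_reconstruct_8x8_pixel(uint16_t pred, int16_t residual,
                                             int bd) {
  const int max = (1 << bd) - 1;          // e.g. 1023 for 10-bit content
  int v = pred + ((residual + 16) >> 5);  // matches sixteen + srai(..., 5)
  if (v < 0) v = 0;
  if (v > max) v = max;
  return (uint16_t)v;
}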
-
-void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- tran_low_t out[8 * 8] = { 0 };
- tran_low_t *outptr = out;
- int i, j, test;
- __m128i inptr[8];
- __m128i min_input, max_input, temp1, temp2, sign_bits;
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- const __m128i zero = _mm_set1_epi16(0);
- const __m128i sixteen = _mm_set1_epi16(16);
- const __m128i max = _mm_set1_epi16(6201);
- const __m128i min = _mm_set1_epi16(-6201);
- int optimised_cols = 0;
-
- // Load input into __m128i & pack to 16 bits
- for (i = 0; i < 8; i++) {
- temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
- temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
- inptr[i] = _mm_packs_epi32(temp1, temp2);
- }
-
- // Find the min & max for the row transform
-  // Only the first 4 rows have non-zero coeffs
- max_input = _mm_max_epi16(inptr[0], inptr[1]);
- min_input = _mm_min_epi16(inptr[0], inptr[1]);
- for (i = 2; i < 4; i++) {
- max_input = _mm_max_epi16(max_input, inptr[i]);
- min_input = _mm_min_epi16(min_input, inptr[i]);
- }
- max_input = _mm_cmpgt_epi16(max_input, max);
- min_input = _mm_cmplt_epi16(min_input, min);
- temp1 = _mm_or_si128(max_input, min_input);
- test = _mm_movemask_epi8(temp1);
-
- if (!test) {
- // Do the row transform
- idct8_sse2(inptr);
-
- // Find the min & max for the column transform
-    // N.B. Only the first 4 cols contain non-zero coeffs
- max_input = _mm_max_epi16(inptr[0], inptr[1]);
- min_input = _mm_min_epi16(inptr[0], inptr[1]);
- for (i = 2; i < 8; i++) {
- max_input = _mm_max_epi16(max_input, inptr[i]);
- min_input = _mm_min_epi16(min_input, inptr[i]);
- }
- max_input = _mm_cmpgt_epi16(max_input, max);
- min_input = _mm_cmplt_epi16(min_input, min);
- temp1 = _mm_or_si128(max_input, min_input);
- test = _mm_movemask_epi8(temp1);
-
- if (test) {
-      // Use the fact that only the first 4 rows contain non-zero coeffs
- array_transpose_4X8(inptr, inptr);
- for (i = 0; i < 4; i++) {
- sign_bits = _mm_cmplt_epi16(inptr[i], zero);
- temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
- temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
- }
- } else {
- // Set to use the optimised transform for the column
- optimised_cols = 1;
- }
- } else {
- // Run the un-optimised row transform
- for (i = 0; i < 4; ++i) {
- vpx_highbd_idct8_c(input, outptr, bd);
- input += 8;
- outptr += 8;
- }
- }
-
- if (optimised_cols) {
- idct8_sse2(inptr);
-
-    // Final rounding and shift, then reconstruct and store
- {
- __m128i d[8];
- for (i = 0; i < 8; i++) {
- inptr[i] = _mm_add_epi16(inptr[i], sixteen);
- d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
- inptr[i] = _mm_srai_epi16(inptr[i], 5);
- d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
- // Store
- _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]);
- }
- }
- } else {
- // Run the un-optimised column transform
- tran_low_t temp_in[8], temp_out[8];
- for (i = 0; i < 8; ++i) {
- for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
- vpx_highbd_idct8_c(temp_in, temp_out, bd);
- for (j = 0; j < 8; ++j) {
- dest[j * stride + i] = highbd_clip_pixel_add(
- dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
- }
- }
- }
-}
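When the column range check fails, the code above spills the 16-bit rows back into the 32-bit tran_low_t buffer. The sign_bits/unpack pairs are the standard SSE2 sign-extension idiom: comparing against zero yields an all-ones or all-zeros mask per lane, and interleaving it with the data widens each lane to 32 bits. In scalar terms (a sketch only, not libvpx code):

static void widen_16bit_row(const int16_t *row, tran_low_t *out, int n) {
  int i;
  // The int16 -> int32 conversion sign-extends, which is exactly what the
  // cmplt/unpacklo/unpackhi sequence does eight lanes at a time.
  for (i = 0; i < n; i++) out[i] = (tran_low_t)row[i];
}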
-
-void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- tran_low_t out[16 * 16];
- tran_low_t *outptr = out;
- int i, j, test;
- __m128i inptr[32];
- __m128i min_input, max_input, temp1, temp2, sign_bits;
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- const __m128i zero = _mm_set1_epi16(0);
- const __m128i rounding = _mm_set1_epi16(32);
- const __m128i max = _mm_set1_epi16(3155);
- const __m128i min = _mm_set1_epi16(-3155);
- int optimised_cols = 0;
-
- // Load input into __m128i & pack to 16 bits
- for (i = 0; i < 16; i++) {
- temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
- temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
- inptr[i] = _mm_packs_epi32(temp1, temp2);
- temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
- temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
- inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
- }
-
- // Find the min & max for the row transform
- max_input = _mm_max_epi16(inptr[0], inptr[1]);
- min_input = _mm_min_epi16(inptr[0], inptr[1]);
- for (i = 2; i < 32; i++) {
- max_input = _mm_max_epi16(max_input, inptr[i]);
- min_input = _mm_min_epi16(min_input, inptr[i]);
- }
- max_input = _mm_cmpgt_epi16(max_input, max);
- min_input = _mm_cmplt_epi16(min_input, min);
- temp1 = _mm_or_si128(max_input, min_input);
- test = _mm_movemask_epi8(temp1);
-
- if (!test) {
- // Do the row transform
- idct16_sse2(inptr, inptr + 16);
-
- // Find the min & max for the column transform
- max_input = _mm_max_epi16(inptr[0], inptr[1]);
- min_input = _mm_min_epi16(inptr[0], inptr[1]);
- for (i = 2; i < 32; i++) {
- max_input = _mm_max_epi16(max_input, inptr[i]);
- min_input = _mm_min_epi16(min_input, inptr[i]);
- }
- max_input = _mm_cmpgt_epi16(max_input, max);
- min_input = _mm_cmplt_epi16(min_input, min);
- temp1 = _mm_or_si128(max_input, min_input);
- test = _mm_movemask_epi8(temp1);
-
- if (test) {
- array_transpose_16x16(inptr, inptr + 16);
- for (i = 0; i < 16; i++) {
- sign_bits = _mm_cmplt_epi16(inptr[i], zero);
- temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
- temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
- sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
- temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
- temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
- }
- } else {
- // Set to use the optimised transform for the column
- optimised_cols = 1;
- }
- } else {
- // Run the un-optimised row transform
- for (i = 0; i < 16; ++i) {
- vpx_highbd_idct16_c(input, outptr, bd);
- input += 16;
- outptr += 16;
- }
- }
-
- if (optimised_cols) {
- idct16_sse2(inptr, inptr + 16);
-
-    // Final rounding and shift, then reconstruct and store
- {
- __m128i d[2];
- for (i = 0; i < 16; i++) {
- inptr[i] = _mm_add_epi16(inptr[i], rounding);
- inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding);
- d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
- d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8));
- inptr[i] = _mm_srai_epi16(inptr[i], 6);
- inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6);
- d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd);
- d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd);
- // Store
- _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]);
- _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]);
- }
- }
- } else {
- // Run the un-optimised column transform
- tran_low_t temp_in[16], temp_out[16];
- for (i = 0; i < 16; ++i) {
- for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
- vpx_highbd_idct16_c(temp_in, temp_out, bd);
- for (j = 0; j < 16; ++j) {
- dest[j * stride + i] = highbd_clip_pixel_add(
- dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
- }
- }
- }
-}
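Note how the vector constants line up with the scalar fallback: ROUND_POWER_OF_TWO(x, 6) expands in libvpx to (x + 32) >> 6, which is why the packed path adds _mm_set1_epi16(32) before shifting right by 6 (and, in the 8x8 path, 16 before shifting by 5). Written out:

// Equivalent of libvpx's ROUND_POWER_OF_TWO(value, n): add half the
// divisor, then shift. For n == 6 this is (value + 32) >> 6.
static int32_t round_power_of_two(int32_t value, int n) {
  return (value + (1 << (n - 1))) >> n;
}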
-
-void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- tran_low_t out[16 * 16] = { 0 };
- tran_low_t *outptr = out;
- int i, j, test;
- __m128i inptr[32];
- __m128i min_input, max_input, temp1, temp2, sign_bits;
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- const __m128i zero = _mm_set1_epi16(0);
- const __m128i rounding = _mm_set1_epi16(32);
- const __m128i max = _mm_set1_epi16(3155);
- const __m128i min = _mm_set1_epi16(-3155);
- int optimised_cols = 0;
-
- // Load input into __m128i & pack to 16 bits
- for (i = 0; i < 16; i++) {
- temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
- temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
- inptr[i] = _mm_packs_epi32(temp1, temp2);
- temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
- temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
- inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
- }
-
- // Find the min & max for the row transform
-  // Since all non-zero dct coefficients are in the upper-left 4x4 area,
-  // we only need to consider the first 4 rows here.
- max_input = _mm_max_epi16(inptr[0], inptr[1]);
- min_input = _mm_min_epi16(inptr[0], inptr[1]);
- for (i = 2; i < 4; i++) {
- max_input = _mm_max_epi16(max_input, inptr[i]);
- min_input = _mm_min_epi16(min_input, inptr[i]);
- }
- max_input = _mm_cmpgt_epi16(max_input, max);
- min_input = _mm_cmplt_epi16(min_input, min);
- temp1 = _mm_or_si128(max_input, min_input);
- test = _mm_movemask_epi8(temp1);
-
- if (!test) {
- // Do the row transform (N.B. This transposes inptr)
- idct16_sse2(inptr, inptr + 16);
-
- // Find the min & max for the column transform
-    // N.B. Only the first 4 cols contain non-zero coeffs
- max_input = _mm_max_epi16(inptr[0], inptr[1]);
- min_input = _mm_min_epi16(inptr[0], inptr[1]);
- for (i = 2; i < 16; i++) {
- max_input = _mm_max_epi16(max_input, inptr[i]);
- min_input = _mm_min_epi16(min_input, inptr[i]);
- }
- max_input = _mm_cmpgt_epi16(max_input, max);
- min_input = _mm_cmplt_epi16(min_input, min);
- temp1 = _mm_or_si128(max_input, min_input);
- test = _mm_movemask_epi8(temp1);
-
- if (test) {
-      // Use the fact that only the first 4 rows contain non-zero coeffs
- array_transpose_8x8(inptr, inptr);
- array_transpose_8x8(inptr + 8, inptr + 16);
- for (i = 0; i < 4; i++) {
- sign_bits = _mm_cmplt_epi16(inptr[i], zero);
- temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
- temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
- sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
- temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
- temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
- }
- } else {
- // Set to use the optimised transform for the column
- optimised_cols = 1;
- }
- } else {
- // Run the un-optimised row transform
- for (i = 0; i < 4; ++i) {
- vpx_highbd_idct16_c(input, outptr, bd);
- input += 16;
- outptr += 16;
- }
- }
-
- if (optimised_cols) {
- idct16_sse2(inptr, inptr + 16);
-
-    // Final rounding and shift, then reconstruct and store
- {
- __m128i d[2];
- for (i = 0; i < 16; i++) {
- inptr[i] = _mm_add_epi16(inptr[i], rounding);
- inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding);
- d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
- d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8));
- inptr[i] = _mm_srai_epi16(inptr[i], 6);
- inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6);
- d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd);
- d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd);
- // Store
- _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]);
- _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]);
- }
- }
- } else {
- // Run the un-optimised column transform
- tran_low_t temp_in[16], temp_out[16];
- for (i = 0; i < 16; ++i) {
- for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
- vpx_highbd_idct16_c(temp_in, temp_out, bd);
- for (j = 0; j < 16; ++j) {
- dest[j * stride + i] = highbd_clip_pixel_add(
- dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
- }
- }
- }
-}
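The _10 variant relies on the property stated in its comments: with at most 10 non-zero coefficients, everything lives in the upper-left 4x4 of the 16x16 block, so only 4 of the packed rows need checking and transforming. A hypothetical helper expressing that precondition:

static int nonzero_confined_to_top_left_4x4(const tran_low_t *coeffs,
                                            int size) {
  int r, c;
  for (r = 0; r < size; r++)
    for (c = 0; c < size; c++)
      if ((r >= 4 || c >= 4) && coeffs[r * size + c] != 0) return 0;
  return 1;  // safe to use the reduced-row fast path
}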
-
-void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- __m128i dc_value, d;
- const __m128i zero = _mm_setzero_si128();
- const __m128i one = _mm_set1_epi16(1);
- const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
- int a, i, j;
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- tran_low_t out;
-
- out = HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
- out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
- a = ROUND_POWER_OF_TWO(out, 6);
-
- d = _mm_set1_epi32(a);
- dc_value = _mm_packs_epi32(d, d);
- for (i = 0; i < 32; ++i) {
- for (j = 0; j < 4; ++j) {
- d = _mm_loadu_si128((const __m128i *)(&dest[j * 8]));
- d = _mm_adds_epi16(d, dc_value);
- d = _mm_max_epi16(d, zero);
- d = _mm_min_epi16(d, max);
- _mm_storeu_si128((__m128i *)(&dest[j * 8]), d);
- }
- dest += stride;
- }
-}
-#endif // CONFIG_VP9_HIGHBITDEPTH
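The deleted 32x32 DC-only function has a compact scalar equivalent built from macros visible in this diff (HIGHBD_WRAPLOW, dct_const_round_shift, ROUND_POWER_OF_TWO, highbd_clip_pixel_add): scale the DC coefficient twice by cospi_16_64, round by 6, then splat the result over the block. A sketch (function name hypothetical):

static void highbd_idct32x32_1_add_sketch(const tran_low_t *input,
                                          uint16_t *dest, int stride, int bd) {
  int i, j, a;
  tran_low_t out;
  out = HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
  out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
  a = ROUND_POWER_OF_TWO(out, 6);
  for (i = 0; i < 32; ++i, dest += stride)
    for (j = 0; j < 32; ++j)
      dest[j] = highbd_clip_pixel_add(dest[j], a, bd);  // clamp to bit depth
}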
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_sse2.h b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_sse2.h
index d5683ab1cf0..0460ab13bcb 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_sse2.h
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_sse2.h
@@ -279,6 +279,34 @@ static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) {
res3 = _mm_packs_epi32(tmp6, tmp7); \
}
+static INLINE void recon_and_store4x4_sse2(const __m128i *const in,
+ uint8_t *const dest,
+ const int stride) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i d[2];
+
+ // Reconstruction and Store
+ d[0] = _mm_cvtsi32_si128(*(const int *)(dest));
+ d[1] = _mm_cvtsi32_si128(*(const int *)(dest + stride * 3));
+ d[0] = _mm_unpacklo_epi32(d[0],
+ _mm_cvtsi32_si128(*(const int *)(dest + stride)));
+ d[1] = _mm_unpacklo_epi32(
+ _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)), d[1]);
+ d[0] = _mm_unpacklo_epi8(d[0], zero);
+ d[1] = _mm_unpacklo_epi8(d[1], zero);
+ d[0] = _mm_add_epi16(d[0], in[0]);
+ d[1] = _mm_add_epi16(d[1], in[1]);
+ d[0] = _mm_packus_epi16(d[0], d[1]);
+
+ *(int *)dest = _mm_cvtsi128_si32(d[0]);
+ d[0] = _mm_srli_si128(d[0], 4);
+ *(int *)(dest + stride) = _mm_cvtsi128_si32(d[0]);
+ d[0] = _mm_srli_si128(d[0], 4);
+ *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d[0]);
+ d[0] = _mm_srli_si128(d[0], 4);
+ *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d[0]);
+}
+
void idct4_sse2(__m128i *in);
void idct8_sse2(__m128i *in);
void idct16_sse2(__m128i *in0, __m128i *in1);
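For orientation, the new recon_and_store4x4_sse2 helper computes, in four vector stores, what this scalar loop would (a sketch only; it assumes in[0] holds rows 0-1 and in[1] rows 2-3 of the 16-bit residual, laid out row-major):

static void recon_and_store4x4_c(const int16_t *res, uint8_t *dest,
                                 int stride) {
  int r, c;
  for (r = 0; r < 4; r++) {
    for (c = 0; c < 4; c++) {
      int v = dest[r * stride + c] + res[r * 4 + c];
      if (v < 0) v = 0;      // _mm_packus_epi16 saturates low...
      if (v > 255) v = 255;  // ...and high while packing back to bytes
      dest[r * stride + c] = (uint8_t)v;
    }
  }
}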
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/transpose_sse2.h b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/transpose_sse2.h
new file mode 100644
index 00000000000..a5e40245a09
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/transpose_sse2.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_X86_TRANSPOSE_SSE2_H_
+#define VPX_DSP_X86_TRANSPOSE_SSE2_H_
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
+static INLINE void transpose_16bit_4x4(__m128i *res) {
+ const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
+ const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
+
+ res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1);
+ res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1);
+}
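A lane-by-lane walkthrough may help here, assuming res[0] holds rows 0-1 and res[1] holds rows 2-3 of a 4x4 block of 16-bit values, with lanes labelled rc (row, column):

// Input:  res[0] = 00 01 02 03 10 11 12 13   (rows 0 and 1)
//         res[1] = 20 21 22 23 30 31 32 33   (rows 2 and 3)
// tr0_0   = 00 20 01 21 02 22 03 23          (interleave low halves)
// tr0_1   = 10 30 11 31 12 32 13 33          (interleave high halves)
// Output: res[0] = 00 10 20 30 01 11 21 31   (transposed rows 0 and 1)
//         res[1] = 02 12 22 32 03 13 23 33   (transposed rows 2 and 3)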
+
+static INLINE void transpose_32bit_4x4(__m128i *const a0, __m128i *const a1,
+ __m128i *const a2, __m128i *const a3) {
+ // Unpack 32 bit elements. Goes from:
+ // a0: 00 01 02 03
+ // a1: 10 11 12 13
+ // a2: 20 21 22 23
+ // a3: 30 31 32 33
+ // to:
+ // b0: 00 10 01 11
+ // b1: 20 30 21 31
+ // b2: 02 12 03 13
+ // b3: 22 32 23 33
+
+ const __m128i b0 = _mm_unpacklo_epi32(*a0, *a1);
+ const __m128i b1 = _mm_unpacklo_epi32(*a2, *a3);
+ const __m128i b2 = _mm_unpackhi_epi32(*a0, *a1);
+ const __m128i b3 = _mm_unpackhi_epi32(*a2, *a3);
+
+ // Unpack 64 bit elements resulting in:
+ // a0: 00 10 20 30
+ // a1: 01 11 21 31
+ // a2: 02 12 22 32
+ // a3: 03 13 23 33
+ *a0 = _mm_unpacklo_epi64(b0, b1);
+ *a1 = _mm_unpackhi_epi64(b0, b1);
+ *a2 = _mm_unpacklo_epi64(b2, b3);
+ *a3 = _mm_unpackhi_epi64(b2, b3);
+}
+
+#endif // VPX_DSP_X86_TRANSPOSE_SSE2_H_
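A small usage example of the 32-bit variant (illustrative values only):

__m128i r0 = _mm_setr_epi32(0, 1, 2, 3);      // row 0
__m128i r1 = _mm_setr_epi32(10, 11, 12, 13);  // row 1
__m128i r2 = _mm_setr_epi32(20, 21, 22, 23);  // row 2
__m128i r3 = _mm_setr_epi32(30, 31, 32, 33);  // row 3
transpose_32bit_4x4(&r0, &r1, &r2, &r3);
// r0 = 0 10 20 30, r1 = 1 11 21 31, r2 = 2 12 22 32, r3 = 3 13 23 33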
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
index e2311c11670..389a692dbc9 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
@@ -32,9 +32,7 @@ cglobal convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \
mov r4d, dword wm
%ifidn %2, highbd
shl r4d, 1
- shl srcq, 1
shl src_strideq, 1
- shl dstq, 1
shl dst_strideq, 1
%else
cmp r4d, 4
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_ports/mem.h b/chromium/third_party/libvpx/source/libvpx/vpx_ports/mem.h
index 49954e90477..bfef783b133 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_ports/mem.h
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_ports/mem.h
@@ -35,8 +35,10 @@
(((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
+#define CAST_TO_SHORTPTR(x) ((uint16_t *)((uintptr_t)(x)))
#if CONFIG_VP9_HIGHBITDEPTH
#define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))
+#define CAST_TO_BYTEPTR(x) ((uint8_t *)((uintptr_t)(x)))
#endif // CONFIG_VP9_HIGHBITDEPTH
#if !defined(__has_feature)
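The difference between the existing CONVERT_* macros and the new CAST_* ones is worth spelling out: CONVERT_TO_BYTEPTR/CONVERT_TO_SHORTPTR encode a uint16_t buffer as a fake byte pointer by shifting the address, while CAST_TO_BYTEPTR/CAST_TO_SHORTPTR merely reinterpret it. An illustration based on the definitions above (buf is any real uint16_t array):

uint16_t buf[16];
uint8_t *fake = CONVERT_TO_BYTEPTR(buf);     // address >> 1, not dereferenceable
uint16_t *back = CONVERT_TO_SHORTPTR(fake);  // address << 1, equals buf again
uint8_t *alias = CAST_TO_BYTEPTR(buf);       // plain reinterpret, same address
uint16_t *same = CAST_TO_SHORTPTR(alias);    // plain reinterpret, equals buf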
diff --git a/chromium/third_party/libvpx/source/libvpx/vpxdec.c b/chromium/third_party/libvpx/source/libvpx/vpxdec.c
index fa85ac8587c..6db2afb4aec 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpxdec.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpxdec.c
@@ -977,7 +977,7 @@ static int main_loop(int argc, const char **argv_) {
if (do_md5) {
update_image_md5(img, planes, &md5_ctx);
} else {
- write_image_file(img, planes, outfile);
+ if (!corrupted) write_image_file(img, planes, outfile);
}
} else {
generate_filename(outfile_pattern, outfile_name, PATH_MAX, img->d_w,
diff --git a/chromium/third_party/libvpx/source/libvpx/webmdec.cc b/chromium/third_party/libvpx/source/libvpx/webmdec.cc
index ed4bd700dd7..d609075a932 100644
--- a/chromium/third_party/libvpx/source/libvpx/webmdec.cc
+++ b/chromium/third_party/libvpx/source/libvpx/webmdec.cc
@@ -165,10 +165,11 @@ int webm_read_frame(struct WebmInputContext *webm_ctx, uint8_t **buffer,
}
if (get_new_block) {
block = block_entry->GetBlock();
+ if (block == NULL) return -1;
webm_ctx->block_frame_index = 0;
}
- } while (block->GetTrackNumber() != webm_ctx->video_track_index ||
- block_entry_eos);
+ } while (block_entry_eos ||
+ block->GetTrackNumber() != webm_ctx->video_track_index);
webm_ctx->cluster = cluster;
webm_ctx->block_entry = block_entry;