author     Allan Sandfeld Jensen <allan.jensen@qt.io>    2017-04-05 14:08:31 +0200
committer  Allan Sandfeld Jensen <allan.jensen@qt.io>    2017-04-11 07:46:53 +0000
commit     6a4cabb866f66d4128a97cdc6d9d08ce074f1247 (patch)
tree       ab00f70a5e89278d6a0d16ff0c42578dc4d84a2d /chromium/third_party/libvpx
parent     e733310db58160074f574c429d48f8308c0afe17 (diff)
download   qtwebengine-chromium-6a4cabb866f66d4128a97cdc6d9d08ce074f1247.tar.gz
BASELINE: Update Chromium to 57.0.2987.144
Change-Id: I29db402ff696c71a04c4dbaec822c2e53efe0267
Reviewed-by: Peter Varga <pvarga@inf.u-szeged.hu>
Diffstat (limited to 'chromium/third_party/libvpx')
-rw-r--r--  chromium/third_party/libvpx/BUILD.gn | 4
-rw-r--r--  chromium/third_party/libvpx/README.chromium | 4
-rw-r--r--  chromium/third_party/libvpx/libvpx_srcs.gni | 9
-rw-r--r--  chromium/third_party/libvpx/source/config/ios/arm-neon/vp9_rtcd.h | 8
-rw-r--r--  chromium/third_party/libvpx/source/config/ios/arm-neon/vpx_dsp_rtcd.h | 75
-rw-r--r--  chromium/third_party/libvpx/source/config/ios/arm64/vp9_rtcd.h | 8
-rw-r--r--  chromium/third_party/libvpx/source/config/ios/arm64/vpx_config.c | 2
-rw-r--r--  chromium/third_party/libvpx/source/config/ios/arm64/vpx_dsp_rtcd.h | 75
-rw-r--r--  chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vp9_rtcd.h | 12
-rw-r--r--  chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vpx_dsp_rtcd.h | 113
-rw-r--r--  chromium/third_party/libvpx/source/config/linux/arm-neon/vp9_rtcd.h | 8
-rw-r--r--  chromium/third_party/libvpx/source/config/linux/arm-neon/vpx_dsp_rtcd.h | 75
-rw-r--r--  chromium/third_party/libvpx/source/config/linux/arm/vp9_rtcd.h | 4
-rw-r--r--  chromium/third_party/libvpx/source/config/linux/arm/vpx_dsp_rtcd.h | 28
-rw-r--r--  chromium/third_party/libvpx/source/config/linux/arm64/vp9_rtcd.h | 8
-rw-r--r--  chromium/third_party/libvpx/source/config/linux/arm64/vpx_dsp_rtcd.h | 75
-rw-r--r--  chromium/third_party/libvpx/source/config/linux/generic/vp9_rtcd.h | 8
-rw-r--r--  chromium/third_party/libvpx/source/config/linux/generic/vpx_dsp_rtcd.h | 54
-rw-r--r--  chromium/third_party/libvpx/source/config/linux/ia32/vp8_rtcd.h | 2
-rw-r--r--  chromium/third_party/libvpx/source/config/linux/ia32/vp9_rtcd.h | 16
-rw-r--r--  chromium/third_party/libvpx/source/config/linux/ia32/vpx_dsp_rtcd.h | 138
-rw-r--r--  chromium/third_party/libvpx/source/config/linux/mips64el/vp9_rtcd.h | 4
-rw-r--r--  chromium/third_party/libvpx/source/config/linux/mips64el/vpx_dsp_rtcd.h | 28
-rw-r--r--  chromium/third_party/libvpx/source/config/linux/mipsel/vp9_rtcd.h | 4
-rw-r--r--  chromium/third_party/libvpx/source/config/linux/mipsel/vpx_dsp_rtcd.h | 28
-rw-r--r--  chromium/third_party/libvpx/source/config/linux/x64/vp8_rtcd.h | 4
-rw-r--r--  chromium/third_party/libvpx/source/config/linux/x64/vp9_rtcd.h | 12
-rw-r--r--  chromium/third_party/libvpx/source/config/linux/x64/vpx_dsp_rtcd.h | 120
-rw-r--r--  chromium/third_party/libvpx/source/config/mac/ia32/vp8_rtcd.h | 2
-rw-r--r--  chromium/third_party/libvpx/source/config/mac/ia32/vp9_rtcd.h | 16
-rw-r--r--  chromium/third_party/libvpx/source/config/mac/ia32/vpx_dsp_rtcd.h | 138
-rw-r--r--  chromium/third_party/libvpx/source/config/mac/x64/vp8_rtcd.h | 4
-rw-r--r--  chromium/third_party/libvpx/source/config/mac/x64/vp9_rtcd.h | 12
-rw-r--r--  chromium/third_party/libvpx/source/config/mac/x64/vpx_dsp_rtcd.h | 120
-rw-r--r--  chromium/third_party/libvpx/source/config/nacl/vp9_rtcd.h | 8
-rw-r--r--  chromium/third_party/libvpx/source/config/nacl/vpx_dsp_rtcd.h | 54
-rw-r--r--  chromium/third_party/libvpx/source/config/vpx_version.h | 6
-rw-r--r--  chromium/third_party/libvpx/source/config/win/ia32/vp8_rtcd.h | 2
-rw-r--r--  chromium/third_party/libvpx/source/config/win/ia32/vp9_rtcd.h | 16
-rw-r--r--  chromium/third_party/libvpx/source/config/win/ia32/vpx_dsp_rtcd.h | 138
-rw-r--r--  chromium/third_party/libvpx/source/config/win/x64/vp8_rtcd.h | 4
-rw-r--r--  chromium/third_party/libvpx/source/config/win/x64/vp9_rtcd.h | 12
-rw-r--r--  chromium/third_party/libvpx/source/config/win/x64/vpx_dsp_rtcd.h | 120
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/build/make/Android.mk | 30
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/examples/vp8_multi_resolution_encoder.c | 14
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/examples/vp9_spatial_svc_encoder.c | 4
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/examples/vpx_temporal_svc_encoder.c | 3
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/libs.mk | 4
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/tools_common.h | 19
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/mips/dspr2/filter_dspr2.c | 1
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/mips/dspr2/vp8_loopfilter_filters_dspr2.c | 5
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/ppflags.h | 9
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/rtcd_defs.pl | 5
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/threading.h | 39
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/decoder/onyxd_int.h | 5
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/decoder/threading.c | 60
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/encoder/encodeframe.c | 21
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/encoder/ethreading.c | 48
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_if.c | 34
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_int.h | 2
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c | 34
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c | 126
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c | 16
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c | 16
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_ppflags.h | 9
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_rtcd_defs.pl | 20
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c | 22
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeframe.c | 127
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemb.c | 49
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.c | 165
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.h | 40
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.c | 14
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.h | 1
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mcomp.c | 21
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_noise_estimate.c | 12
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_pickmode.c | 40
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.c | 69
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.c | 18
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.h | 13
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_svc_layercontext.c | 15
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_temporal_filter.c | 16
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/vp9_cx_iface.c | 69
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx/src/svc_encodeframe.c | 2
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/deblock_neon.c | 485
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c | 171
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c | 614
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_intrapred_neon.c | 1078
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.asm | 5
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c | 95
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.asm | 375
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.c | 1943
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_neon.c | 72
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c | 714
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c | 145
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c | 23
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_add_neon.c | 144
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.asm | 5
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c | 54
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_add_neon.asm | 31
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_add_neon.c | 175
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.asm | 5
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c | 84
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_add_neon.asm | 60
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_add_neon.c | 522
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.asm | 18
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.h | 424
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/intrapred_neon.c | 504
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/loopfilter_4_neon.asm | 134
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/loopfilter_vertical_4_dual_neon.c | 23
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/transpose_neon.h | 430
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/deblock.c | 6
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/inv_txfm.c | 33
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/convolve8_avg_dspr2.c | 7
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/convolve8_dspr2.c | 5
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/deblock_msa.c | 7
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/intrapred16_dspr2.c | 1
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/intrapred4_dspr2.c | 1
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/intrapred8_dspr2.c | 1
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/inv_txfm_dspr2.h | 11
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/itrans16_dspr2.c | 93
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/itrans32_cols_dspr2.c | 550
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/itrans32_dspr2.c | 687
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/itrans4_dspr2.c | 113
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/itrans8_dspr2.c | 148
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/txfm_macros_msa.h | 19
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp.mk | 27
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl | 225
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/deblock_sse2.asm | 16
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_sse2.c | 52
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm | 2
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_ports/arm_cpudetect.c | 4
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_ports/vpx_timer.h | 2
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_ports/x86.h | 5
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpxenc.c | 4
134 files changed, 8809 insertions, 4338 deletions
diff --git a/chromium/third_party/libvpx/BUILD.gn b/chromium/third_party/libvpx/BUILD.gn
index 0a39291205b..2c79ee0f405 100644
--- a/chromium/third_party/libvpx/BUILD.gn
+++ b/chromium/third_party/libvpx/BUILD.gn
@@ -38,9 +38,9 @@ if (is_nacl) {
# vpx_config.asm
if (is_ios && current_cpu == "arm") {
os_category = current_os
- } else if (is_posix) { # Should cover linux, mac, and the ios simulator.
+ } else if (is_posix) { # Should cover linux, mac, and the ios simulator.
os_category = "linux"
- } else { # This should only match windows.
+ } else { # This should only match windows.
os_category = current_os
}
platform_include_dir =
diff --git a/chromium/third_party/libvpx/README.chromium b/chromium/third_party/libvpx/README.chromium
index 390b58ebdc1..7fcfd85f425 100644
--- a/chromium/third_party/libvpx/README.chromium
+++ b/chromium/third_party/libvpx/README.chromium
@@ -5,9 +5,9 @@ License: BSD
License File: source/libvpx/LICENSE
Security Critical: yes
-Date: Tuesday November 08 2016
+Date: Monday January 09 2017
Branch: master
-Commit: 5c64c01c7ca3780d30f140e54a30088f780ae66a
+Commit: 5b1a8ca5e846f838062becaec9ed6b5ecef306e5
Description:
Contains the sources used to compile libvpx binaries used by Google Chrome and
diff --git a/chromium/third_party/libvpx/libvpx_srcs.gni b/chromium/third_party/libvpx/libvpx_srcs.gni
index 37850e29793..664a0fdbb2e 100644
--- a/chromium/third_party/libvpx/libvpx_srcs.gni
+++ b/chromium/third_party/libvpx/libvpx_srcs.gni
@@ -1515,15 +1515,16 @@ libvpx_srcs_arm_neon = [
"//third_party/libvpx/source/libvpx/vpx/vpx_integer.h",
"//third_party/libvpx/source/libvpx/vpx_dsp/add_noise.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/arm/avg_neon.c",
+ "//third_party/libvpx/source/libvpx/vpx_dsp/arm/deblock_neon.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/arm/fwd_txfm_neon.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/arm/hadamard_neon.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_neon.c",
+ "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_add_neon.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.h",
"//third_party/libvpx/source/libvpx/vpx_dsp/arm/intrapred_neon.c",
- "//third_party/libvpx/source/libvpx/vpx_dsp/arm/loopfilter_vertical_4_dual_neon.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/arm/sad4d_neon.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/arm/sad_neon.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/arm/subpel_variance_neon.c",
@@ -1986,14 +1987,15 @@ libvpx_srcs_arm_neon_cpu_detect_neon = [
"//third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_error_neon.c",
"//third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/arm/avg_neon.c",
+ "//third_party/libvpx/source/libvpx/vpx_dsp/arm/deblock_neon.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/arm/fwd_txfm_neon.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/arm/hadamard_neon.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_neon.c",
+ "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_add_neon.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/arm/intrapred_neon.c",
- "//third_party/libvpx/source/libvpx/vpx_dsp/arm/loopfilter_vertical_4_dual_neon.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/arm/sad4d_neon.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/arm/sad_neon.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/arm/subpel_variance_neon.c",
@@ -2299,11 +2301,12 @@ libvpx_srcs_arm64 = [
"//third_party/libvpx/source/libvpx/vpx/vpx_integer.h",
"//third_party/libvpx/source/libvpx/vpx_dsp/add_noise.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/arm/avg_neon.c",
+ "//third_party/libvpx/source/libvpx/vpx_dsp/arm/deblock_neon.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/arm/fwd_txfm_neon.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/arm/hadamard_neon.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.c",
- "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_neon.c",
+ "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c",
"//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_add_neon.c",
diff --git a/chromium/third_party/libvpx/source/config/ios/arm-neon/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/ios/arm-neon/vp9_rtcd.h
index 5f0e862cbfa..d07bcaa3def 100644
--- a/chromium/third_party/libvpx/source/config/ios/arm-neon/vp9_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/ios/arm-neon/vp9_rtcd.h
@@ -70,12 +70,12 @@ void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
#define vp9_iht16x16_256_add vp9_iht16x16_256_add_c
-void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
-void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
+void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
#define vp9_iht4x4_16_add vp9_iht4x4_16_add_neon
-void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
-void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
+void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
#define vp9_iht8x8_64_add vp9_iht8x8_64_add_neon
void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
diff --git a/chromium/third_party/libvpx/source/config/ios/arm-neon/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/ios/arm-neon/vpx_dsp_rtcd.h
index a5c50f21727..c9d867d00cd 100644
--- a/chromium/third_party/libvpx/source/config/ios/arm-neon/vpx_dsp_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/ios/arm-neon/vpx_dsp_rtcd.h
@@ -75,17 +75,20 @@ void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *a
#define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c
void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_c
+void vpx_d135_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_neon
void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_c
+void vpx_d135_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_neon
void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vpx_d135_predictor_4x4 vpx_d135_predictor_4x4_neon
void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_c
+void vpx_d135_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_neon
void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vpx_d153_predictor_16x16 vpx_d153_predictor_16x16_c
@@ -128,7 +131,8 @@ void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_
#define vpx_d45_predictor_16x16 vpx_d45_predictor_16x16_neon
void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d45_predictor_32x32 vpx_d45_predictor_32x32_c
+void vpx_d45_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_32x32 vpx_d45_predictor_32x32_neon
void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
@@ -312,52 +316,52 @@ void vpx_hadamard_8x8_neon(const int16_t *src_diff, int src_stride, int16_t *coe
void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c
-void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct16x16_10_add vpx_idct16x16_10_add_neon
-void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct16x16_1_add vpx_idct16x16_1_add_neon
-void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct16x16_256_add vpx_idct16x16_256_add_neon
-void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_neon
-void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
-#define vpx_idct32x32_135_add vpx_idct32x32_1024_add_neon
+void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_135_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct32x32_135_add vpx_idct32x32_135_add_neon
-void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct32x32_1_add vpx_idct32x32_1_add_neon
-void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct32x32_34_add vpx_idct32x32_34_add_neon
-void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct4x4_16_add vpx_idct4x4_16_add_neon
-void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct4x4_1_add vpx_idct4x4_1_add_neon
-void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct8x8_12_add vpx_idct8x8_12_add_neon
-void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct8x8_1_add vpx_idct8x8_1_add_neon
-void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct8x8_64_add vpx_idct8x8_64_add_neon
int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width);
@@ -368,10 +372,10 @@ void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride,
void vpx_int_pro_row_neon(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height);
#define vpx_int_pro_row vpx_int_pro_row_neon
-void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_c
-void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c
void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
@@ -423,10 +427,12 @@ void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0,
#define vpx_lpf_vertical_8_dual vpx_lpf_vertical_8_dual_neon
void vpx_mbpost_proc_across_ip_c(unsigned char *dst, int pitch, int rows, int cols,int flimit);
-#define vpx_mbpost_proc_across_ip vpx_mbpost_proc_across_ip_c
+void vpx_mbpost_proc_across_ip_neon(unsigned char *dst, int pitch, int rows, int cols,int flimit);
+#define vpx_mbpost_proc_across_ip vpx_mbpost_proc_across_ip_neon
void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,int flimit);
-#define vpx_mbpost_proc_down vpx_mbpost_proc_down_c
+void vpx_mbpost_proc_down_neon(unsigned char *dst, int pitch, int rows, int cols,int flimit);
+#define vpx_mbpost_proc_down vpx_mbpost_proc_down_neon
void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
void vpx_minmax_8x8_neon(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
@@ -449,7 +455,8 @@ void vpx_plane_add_noise_c(uint8_t *start, const int8_t *noise, int blackclamp,
#define vpx_plane_add_noise vpx_plane_add_noise_c
void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size);
-#define vpx_post_proc_down_and_across_mb_row vpx_post_proc_down_and_across_mb_row_c
+void vpx_post_proc_down_and_across_mb_row_neon(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size);
+#define vpx_post_proc_down_and_across_mb_row vpx_post_proc_down_and_across_mb_row_neon
void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
#define vpx_quantize_b vpx_quantize_b_c
diff --git a/chromium/third_party/libvpx/source/config/ios/arm64/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/ios/arm64/vp9_rtcd.h
index 5f0e862cbfa..d07bcaa3def 100644
--- a/chromium/third_party/libvpx/source/config/ios/arm64/vp9_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/ios/arm64/vp9_rtcd.h
@@ -70,12 +70,12 @@ void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
#define vp9_iht16x16_256_add vp9_iht16x16_256_add_c
-void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
-void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
+void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
#define vp9_iht4x4_16_add vp9_iht4x4_16_add_neon
-void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
-void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
+void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
#define vp9_iht8x8_64_add vp9_iht8x8_64_add_neon
void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
diff --git a/chromium/third_party/libvpx/source/config/ios/arm64/vpx_config.c b/chromium/third_party/libvpx/source/config/ios/arm64/vpx_config.c
index 5f93ebfb676..56a5348abd6 100644
--- a/chromium/third_party/libvpx/source/config/ios/arm64/vpx_config.c
+++ b/chromium/third_party/libvpx/source/config/ios/arm64/vpx_config.c
@@ -6,5 +6,5 @@
/* in the file PATENTS. All contributing project authors may */
/* be found in the AUTHORS file in the root of the source tree. */
#include "vpx/vpx_codec.h"
-static const char* const cfg = "--target=arm64-linux-gcc --enable-external-build --enable-postproc --enable-multi-res-encoding --enable-temporal-denoising --enable-vp9-temporal-denoising --enable-vp9-postproc --size-limit=16384x16384 --enable-realtime-only --disable-install-docs";
+static const char* const cfg = "--target=armv8-linux-gcc --enable-external-build --enable-postproc --enable-multi-res-encoding --enable-temporal-denoising --enable-vp9-temporal-denoising --enable-vp9-postproc --size-limit=16384x16384 --enable-realtime-only --disable-install-docs";
const char *vpx_codec_build_config(void) {return cfg;}
diff --git a/chromium/third_party/libvpx/source/config/ios/arm64/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/ios/arm64/vpx_dsp_rtcd.h
index a5c50f21727..c9d867d00cd 100644
--- a/chromium/third_party/libvpx/source/config/ios/arm64/vpx_dsp_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/ios/arm64/vpx_dsp_rtcd.h
@@ -75,17 +75,20 @@ void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *a
#define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c
void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_c
+void vpx_d135_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_neon
void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_c
+void vpx_d135_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_neon
void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vpx_d135_predictor_4x4 vpx_d135_predictor_4x4_neon
void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_c
+void vpx_d135_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_neon
void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vpx_d153_predictor_16x16 vpx_d153_predictor_16x16_c
@@ -128,7 +131,8 @@ void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_
#define vpx_d45_predictor_16x16 vpx_d45_predictor_16x16_neon
void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d45_predictor_32x32 vpx_d45_predictor_32x32_c
+void vpx_d45_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_32x32 vpx_d45_predictor_32x32_neon
void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
@@ -312,52 +316,52 @@ void vpx_hadamard_8x8_neon(const int16_t *src_diff, int src_stride, int16_t *coe
void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c
-void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct16x16_10_add vpx_idct16x16_10_add_neon
-void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct16x16_1_add vpx_idct16x16_1_add_neon
-void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct16x16_256_add vpx_idct16x16_256_add_neon
-void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_neon
-void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
-#define vpx_idct32x32_135_add vpx_idct32x32_1024_add_neon
+void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_135_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct32x32_135_add vpx_idct32x32_135_add_neon
-void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct32x32_1_add vpx_idct32x32_1_add_neon
-void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct32x32_34_add vpx_idct32x32_34_add_neon
-void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct4x4_16_add vpx_idct4x4_16_add_neon
-void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct4x4_1_add vpx_idct4x4_1_add_neon
-void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct8x8_12_add vpx_idct8x8_12_add_neon
-void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct8x8_1_add vpx_idct8x8_1_add_neon
-void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct8x8_64_add vpx_idct8x8_64_add_neon
int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width);
@@ -368,10 +372,10 @@ void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride,
void vpx_int_pro_row_neon(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height);
#define vpx_int_pro_row vpx_int_pro_row_neon
-void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_c
-void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c
void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
@@ -423,10 +427,12 @@ void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0,
#define vpx_lpf_vertical_8_dual vpx_lpf_vertical_8_dual_neon
void vpx_mbpost_proc_across_ip_c(unsigned char *dst, int pitch, int rows, int cols,int flimit);
-#define vpx_mbpost_proc_across_ip vpx_mbpost_proc_across_ip_c
+void vpx_mbpost_proc_across_ip_neon(unsigned char *dst, int pitch, int rows, int cols,int flimit);
+#define vpx_mbpost_proc_across_ip vpx_mbpost_proc_across_ip_neon
void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,int flimit);
-#define vpx_mbpost_proc_down vpx_mbpost_proc_down_c
+void vpx_mbpost_proc_down_neon(unsigned char *dst, int pitch, int rows, int cols,int flimit);
+#define vpx_mbpost_proc_down vpx_mbpost_proc_down_neon
void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
void vpx_minmax_8x8_neon(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
@@ -449,7 +455,8 @@ void vpx_plane_add_noise_c(uint8_t *start, const int8_t *noise, int blackclamp,
#define vpx_plane_add_noise vpx_plane_add_noise_c
void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size);
-#define vpx_post_proc_down_and_across_mb_row vpx_post_proc_down_and_across_mb_row_c
+void vpx_post_proc_down_and_across_mb_row_neon(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size);
+#define vpx_post_proc_down_and_across_mb_row vpx_post_proc_down_and_across_mb_row_neon
void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
#define vpx_quantize_b vpx_quantize_b_c
diff --git a/chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vp9_rtcd.h
index f7ac2dc300a..789724ffb93 100644
--- a/chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vp9_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vp9_rtcd.h
@@ -70,13 +70,13 @@ void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
#define vp9_iht16x16_256_add vp9_iht16x16_256_add_c
-void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
-void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
-RTCD_EXTERN void (*vp9_iht4x4_16_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
+void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+RTCD_EXTERN void (*vp9_iht4x4_16_add)(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
-void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
-void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
-RTCD_EXTERN void (*vp9_iht8x8_64_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
+void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+RTCD_EXTERN void (*vp9_iht8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
diff --git a/chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vpx_dsp_rtcd.h
index 0028d86c3ed..2712530f99c 100644
--- a/chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vpx_dsp_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vpx_dsp_rtcd.h
@@ -75,17 +75,20 @@ void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *a
#define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c
void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_c
+void vpx_d135_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d135_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_c
+void vpx_d135_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d135_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
RTCD_EXTERN void (*vpx_d135_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_c
+void vpx_d135_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d135_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vpx_d153_predictor_16x16 vpx_d153_predictor_16x16_c
@@ -128,7 +131,8 @@ void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_
RTCD_EXTERN void (*vpx_d45_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d45_predictor_32x32 vpx_d45_predictor_32x32_c
+void vpx_d45_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d45_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
@@ -312,53 +316,53 @@ RTCD_EXTERN void (*vpx_hadamard_8x8)(const int16_t *src_diff, int src_stride, in
void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c
-void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vpx_idct16x16_10_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct16x16_10_add)(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vpx_idct16x16_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct16x16_1_add)(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vpx_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vpx_idct32x32_135_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_135_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct32x32_135_add)(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vpx_idct32x32_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct32x32_1_add)(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vpx_idct32x32_34_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct32x32_34_add)(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vpx_idct4x4_16_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct4x4_16_add)(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vpx_idct4x4_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct4x4_1_add)(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vpx_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vpx_idct8x8_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct8x8_1_add)(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vpx_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride);
int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width);
int16_t vpx_int_pro_col_neon(const uint8_t *ref, const int width);
@@ -368,10 +372,10 @@ void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride,
void vpx_int_pro_row_neon(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height);
RTCD_EXTERN void (*vpx_int_pro_row)(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height);
-void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_c
-void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c
void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
@@ -423,10 +427,12 @@ void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0,
RTCD_EXTERN void (*vpx_lpf_vertical_8_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
void vpx_mbpost_proc_across_ip_c(unsigned char *dst, int pitch, int rows, int cols,int flimit);
-#define vpx_mbpost_proc_across_ip vpx_mbpost_proc_across_ip_c
+void vpx_mbpost_proc_across_ip_neon(unsigned char *dst, int pitch, int rows, int cols,int flimit);
+RTCD_EXTERN void (*vpx_mbpost_proc_across_ip)(unsigned char *dst, int pitch, int rows, int cols,int flimit);
void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,int flimit);
-#define vpx_mbpost_proc_down vpx_mbpost_proc_down_c
+void vpx_mbpost_proc_down_neon(unsigned char *dst, int pitch, int rows, int cols,int flimit);
+RTCD_EXTERN void (*vpx_mbpost_proc_down)(unsigned char *dst, int pitch, int rows, int cols,int flimit);
void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
void vpx_minmax_8x8_neon(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
@@ -449,7 +455,8 @@ void vpx_plane_add_noise_c(uint8_t *start, const int8_t *noise, int blackclamp,
#define vpx_plane_add_noise vpx_plane_add_noise_c
void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size);
-#define vpx_post_proc_down_and_across_mb_row vpx_post_proc_down_and_across_mb_row_c
+void vpx_post_proc_down_and_across_mb_row_neon(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size);
+RTCD_EXTERN void (*vpx_post_proc_down_and_across_mb_row)(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size);
void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
#define vpx_quantize_b vpx_quantize_b_c
@@ -861,10 +868,18 @@ static void setup_rtcd_internal(void)
if (flags & HAS_NEON) vpx_convolve_avg = vpx_convolve_avg_neon;
vpx_convolve_copy = vpx_convolve_copy_c;
if (flags & HAS_NEON) vpx_convolve_copy = vpx_convolve_copy_neon;
+ vpx_d135_predictor_16x16 = vpx_d135_predictor_16x16_c;
+ if (flags & HAS_NEON) vpx_d135_predictor_16x16 = vpx_d135_predictor_16x16_neon;
+ vpx_d135_predictor_32x32 = vpx_d135_predictor_32x32_c;
+ if (flags & HAS_NEON) vpx_d135_predictor_32x32 = vpx_d135_predictor_32x32_neon;
vpx_d135_predictor_4x4 = vpx_d135_predictor_4x4_c;
if (flags & HAS_NEON) vpx_d135_predictor_4x4 = vpx_d135_predictor_4x4_neon;
+ vpx_d135_predictor_8x8 = vpx_d135_predictor_8x8_c;
+ if (flags & HAS_NEON) vpx_d135_predictor_8x8 = vpx_d135_predictor_8x8_neon;
vpx_d45_predictor_16x16 = vpx_d45_predictor_16x16_c;
if (flags & HAS_NEON) vpx_d45_predictor_16x16 = vpx_d45_predictor_16x16_neon;
+ vpx_d45_predictor_32x32 = vpx_d45_predictor_32x32_c;
+ if (flags & HAS_NEON) vpx_d45_predictor_32x32 = vpx_d45_predictor_32x32_neon;
vpx_d45_predictor_4x4 = vpx_d45_predictor_4x4_c;
if (flags & HAS_NEON) vpx_d45_predictor_4x4 = vpx_d45_predictor_4x4_neon;
vpx_d45_predictor_8x8 = vpx_d45_predictor_8x8_c;
@@ -932,7 +947,7 @@ static void setup_rtcd_internal(void)
vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_c;
if (flags & HAS_NEON) vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_neon;
vpx_idct32x32_135_add = vpx_idct32x32_135_add_c;
- if (flags & HAS_NEON) vpx_idct32x32_135_add = vpx_idct32x32_1024_add_neon;
+ if (flags & HAS_NEON) vpx_idct32x32_135_add = vpx_idct32x32_135_add_neon;
vpx_idct32x32_1_add = vpx_idct32x32_1_add_c;
if (flags & HAS_NEON) vpx_idct32x32_1_add = vpx_idct32x32_1_add_neon;
vpx_idct32x32_34_add = vpx_idct32x32_34_add_c;
@@ -975,10 +990,16 @@ static void setup_rtcd_internal(void)
if (flags & HAS_NEON) vpx_lpf_vertical_8 = vpx_lpf_vertical_8_neon;
vpx_lpf_vertical_8_dual = vpx_lpf_vertical_8_dual_c;
if (flags & HAS_NEON) vpx_lpf_vertical_8_dual = vpx_lpf_vertical_8_dual_neon;
+ vpx_mbpost_proc_across_ip = vpx_mbpost_proc_across_ip_c;
+ if (flags & HAS_NEON) vpx_mbpost_proc_across_ip = vpx_mbpost_proc_across_ip_neon;
+ vpx_mbpost_proc_down = vpx_mbpost_proc_down_c;
+ if (flags & HAS_NEON) vpx_mbpost_proc_down = vpx_mbpost_proc_down_neon;
vpx_minmax_8x8 = vpx_minmax_8x8_c;
if (flags & HAS_NEON) vpx_minmax_8x8 = vpx_minmax_8x8_neon;
vpx_mse16x16 = vpx_mse16x16_c;
if (flags & HAS_NEON) vpx_mse16x16 = vpx_mse16x16_neon;
+ vpx_post_proc_down_and_across_mb_row = vpx_post_proc_down_and_across_mb_row_c;
+ if (flags & HAS_NEON) vpx_post_proc_down_and_across_mb_row = vpx_post_proc_down_and_across_mb_row_neon;
vpx_sad16x16 = vpx_sad16x16_c;
if (flags & HAS_NEON) vpx_sad16x16 = vpx_sad16x16_neon;
vpx_sad16x16x4d = vpx_sad16x16x4d_c;
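The hunks above all follow the same run-time dispatch pattern used by the cpu-detect configs: each routine is declared as an RTCD_EXTERN function pointer, setup_rtcd_internal() first binds it to the portable _c reference implementation, and then rebinds it to the _neon variant when the CPU flags advertise NEON. Single-target configs (such as the plain arm-neon and arm64 headers further down) skip the pointer entirely and bind statically with a #define to the _neon symbol. The following self-contained C sketch mimics only that mechanism; the names my_idct_add, my_idct_add_c, my_idct_add_neon, HAS_NEON_SIM, rtcd_init, and simulated_cpu_flags are hypothetical stand-ins and are not libvpx code.

/* Minimal sketch of the RTCD dispatch pattern, assuming hypothetical names. */
#include <stdint.h>
#include <stdio.h>

typedef int16_t tran_low_t;  /* stand-in for libvpx's coefficient type */

static void my_idct_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  (void)input; (void)dest; (void)stride;
  puts("C reference path");
}

static void my_idct_add_neon(const tran_low_t *input, uint8_t *dest, int stride) {
  (void)input; (void)dest; (void)stride;
  puts("NEON path");
}

/* Plays the role of an RTCD_EXTERN function pointer from the headers above. */
static void (*my_idct_add)(const tran_low_t *input, uint8_t *dest, int stride);

#define HAS_NEON_SIM 0x1  /* hypothetical stand-in for the real HAS_NEON flag */

static int simulated_cpu_flags(void) { return HAS_NEON_SIM; }

/* Mirrors the shape of setup_rtcd_internal(): default to the C routine,
 * then override with the SIMD routine when the CPU advertises support. */
static void rtcd_init(void) {
  int flags = simulated_cpu_flags();
  my_idct_add = my_idct_add_c;
  if (flags & HAS_NEON_SIM) my_idct_add = my_idct_add_neon;
}

int main(void) {
  tran_low_t coeffs[64] = {0};
  uint8_t block[64] = {0};
  rtcd_init();
  my_idct_add(coeffs, block, 8);  /* dispatches to the NEON stand-in here */
  return 0;
}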
diff --git a/chromium/third_party/libvpx/source/config/linux/arm-neon/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/linux/arm-neon/vp9_rtcd.h
index 5f0e862cbfa..d07bcaa3def 100644
--- a/chromium/third_party/libvpx/source/config/linux/arm-neon/vp9_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/linux/arm-neon/vp9_rtcd.h
@@ -70,12 +70,12 @@ void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
#define vp9_iht16x16_256_add vp9_iht16x16_256_add_c
-void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
-void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
+void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
#define vp9_iht4x4_16_add vp9_iht4x4_16_add_neon
-void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
-void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
+void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
#define vp9_iht8x8_64_add vp9_iht8x8_64_add_neon
void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
diff --git a/chromium/third_party/libvpx/source/config/linux/arm-neon/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/linux/arm-neon/vpx_dsp_rtcd.h
index a5c50f21727..c9d867d00cd 100644
--- a/chromium/third_party/libvpx/source/config/linux/arm-neon/vpx_dsp_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/linux/arm-neon/vpx_dsp_rtcd.h
@@ -75,17 +75,20 @@ void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *a
#define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c
void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_c
+void vpx_d135_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_neon
void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_c
+void vpx_d135_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_neon
void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vpx_d135_predictor_4x4 vpx_d135_predictor_4x4_neon
void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_c
+void vpx_d135_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_neon
void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vpx_d153_predictor_16x16 vpx_d153_predictor_16x16_c
@@ -128,7 +131,8 @@ void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_
#define vpx_d45_predictor_16x16 vpx_d45_predictor_16x16_neon
void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d45_predictor_32x32 vpx_d45_predictor_32x32_c
+void vpx_d45_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_32x32 vpx_d45_predictor_32x32_neon
void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
@@ -312,52 +316,52 @@ void vpx_hadamard_8x8_neon(const int16_t *src_diff, int src_stride, int16_t *coe
void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c
-void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct16x16_10_add vpx_idct16x16_10_add_neon
-void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct16x16_1_add vpx_idct16x16_1_add_neon
-void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct16x16_256_add vpx_idct16x16_256_add_neon
-void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_neon
-void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
-#define vpx_idct32x32_135_add vpx_idct32x32_1024_add_neon
+void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_135_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct32x32_135_add vpx_idct32x32_135_add_neon
-void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct32x32_1_add vpx_idct32x32_1_add_neon
-void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct32x32_34_add vpx_idct32x32_34_add_neon
-void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct4x4_16_add vpx_idct4x4_16_add_neon
-void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct4x4_1_add vpx_idct4x4_1_add_neon
-void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct8x8_12_add vpx_idct8x8_12_add_neon
-void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct8x8_1_add vpx_idct8x8_1_add_neon
-void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct8x8_64_add vpx_idct8x8_64_add_neon
int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width);
@@ -368,10 +372,10 @@ void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride,
void vpx_int_pro_row_neon(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height);
#define vpx_int_pro_row vpx_int_pro_row_neon
-void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_c
-void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c
void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
@@ -423,10 +427,12 @@ void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0,
#define vpx_lpf_vertical_8_dual vpx_lpf_vertical_8_dual_neon
void vpx_mbpost_proc_across_ip_c(unsigned char *dst, int pitch, int rows, int cols,int flimit);
-#define vpx_mbpost_proc_across_ip vpx_mbpost_proc_across_ip_c
+void vpx_mbpost_proc_across_ip_neon(unsigned char *dst, int pitch, int rows, int cols,int flimit);
+#define vpx_mbpost_proc_across_ip vpx_mbpost_proc_across_ip_neon
void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,int flimit);
-#define vpx_mbpost_proc_down vpx_mbpost_proc_down_c
+void vpx_mbpost_proc_down_neon(unsigned char *dst, int pitch, int rows, int cols,int flimit);
+#define vpx_mbpost_proc_down vpx_mbpost_proc_down_neon
void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
void vpx_minmax_8x8_neon(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
@@ -449,7 +455,8 @@ void vpx_plane_add_noise_c(uint8_t *start, const int8_t *noise, int blackclamp,
#define vpx_plane_add_noise vpx_plane_add_noise_c
void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size);
-#define vpx_post_proc_down_and_across_mb_row vpx_post_proc_down_and_across_mb_row_c
+void vpx_post_proc_down_and_across_mb_row_neon(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size);
+#define vpx_post_proc_down_and_across_mb_row vpx_post_proc_down_and_across_mb_row_neon
void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
#define vpx_quantize_b vpx_quantize_b_c
diff --git a/chromium/third_party/libvpx/source/config/linux/arm/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/linux/arm/vp9_rtcd.h
index 206f5e5dba4..cd5726c2e6b 100644
--- a/chromium/third_party/libvpx/source/config/linux/arm/vp9_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/linux/arm/vp9_rtcd.h
@@ -68,10 +68,10 @@ void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
#define vp9_iht16x16_256_add vp9_iht16x16_256_add_c
-void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
+void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
#define vp9_iht4x4_16_add vp9_iht4x4_16_add_c
-void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
+void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
#define vp9_iht8x8_64_add vp9_iht8x8_64_add_c
void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
diff --git a/chromium/third_party/libvpx/source/config/linux/arm/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/linux/arm/vpx_dsp_rtcd.h
index 6aa4b73856e..89b44dc986c 100644
--- a/chromium/third_party/libvpx/source/config/linux/arm/vpx_dsp_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/linux/arm/vpx_dsp_rtcd.h
@@ -271,40 +271,40 @@ void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, int16_t *coeff)
void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c
-void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct16x16_10_add vpx_idct16x16_10_add_c
-void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct16x16_1_add vpx_idct16x16_1_add_c
-void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct16x16_256_add vpx_idct16x16_256_add_c
-void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_c
-void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct32x32_135_add vpx_idct32x32_135_add_c
-void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct32x32_1_add vpx_idct32x32_1_add_c
-void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct32x32_34_add vpx_idct32x32_34_add_c
-void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct4x4_16_add vpx_idct4x4_16_add_c
-void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct4x4_1_add vpx_idct4x4_1_add_c
-void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct8x8_12_add vpx_idct8x8_12_add_c
-void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct8x8_1_add vpx_idct8x8_1_add_c
-void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct8x8_64_add vpx_idct8x8_64_add_c
int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width);
@@ -313,10 +313,10 @@ int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width);
void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height);
#define vpx_int_pro_row vpx_int_pro_row_c
-void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_c
-void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c
void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
diff --git a/chromium/third_party/libvpx/source/config/linux/arm64/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/linux/arm64/vp9_rtcd.h
index 5f0e862cbfa..d07bcaa3def 100644
--- a/chromium/third_party/libvpx/source/config/linux/arm64/vp9_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/linux/arm64/vp9_rtcd.h
@@ -70,12 +70,12 @@ void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
#define vp9_iht16x16_256_add vp9_iht16x16_256_add_c
-void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
-void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
+void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
#define vp9_iht4x4_16_add vp9_iht4x4_16_add_neon
-void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
-void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
+void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
#define vp9_iht8x8_64_add vp9_iht8x8_64_add_neon
void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
diff --git a/chromium/third_party/libvpx/source/config/linux/arm64/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/linux/arm64/vpx_dsp_rtcd.h
index a5c50f21727..c9d867d00cd 100644
--- a/chromium/third_party/libvpx/source/config/linux/arm64/vpx_dsp_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/linux/arm64/vpx_dsp_rtcd.h
@@ -75,17 +75,20 @@ void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *a
#define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c
void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_c
+void vpx_d135_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_neon
void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_c
+void vpx_d135_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_neon
void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vpx_d135_predictor_4x4 vpx_d135_predictor_4x4_neon
void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_c
+void vpx_d135_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_neon
void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vpx_d153_predictor_16x16 vpx_d153_predictor_16x16_c
@@ -128,7 +131,8 @@ void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_
#define vpx_d45_predictor_16x16 vpx_d45_predictor_16x16_neon
void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d45_predictor_32x32 vpx_d45_predictor_32x32_c
+void vpx_d45_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_32x32 vpx_d45_predictor_32x32_neon
void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
@@ -312,52 +316,52 @@ void vpx_hadamard_8x8_neon(const int16_t *src_diff, int src_stride, int16_t *coe
void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c
-void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct16x16_10_add vpx_idct16x16_10_add_neon
-void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct16x16_1_add vpx_idct16x16_1_add_neon
-void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct16x16_256_add vpx_idct16x16_256_add_neon
-void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_neon
-void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
-#define vpx_idct32x32_135_add vpx_idct32x32_1024_add_neon
+void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_135_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct32x32_135_add vpx_idct32x32_135_add_neon
-void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct32x32_1_add vpx_idct32x32_1_add_neon
-void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct32x32_34_add vpx_idct32x32_34_add_neon
-void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct4x4_16_add vpx_idct4x4_16_add_neon
-void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct4x4_1_add vpx_idct4x4_1_add_neon
-void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct8x8_12_add vpx_idct8x8_12_add_neon
-void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct8x8_1_add vpx_idct8x8_1_add_neon
-void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct8x8_64_add vpx_idct8x8_64_add_neon
int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width);
@@ -368,10 +372,10 @@ void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride,
void vpx_int_pro_row_neon(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height);
#define vpx_int_pro_row vpx_int_pro_row_neon
-void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_c
-void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c
void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
@@ -423,10 +427,12 @@ void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0,
#define vpx_lpf_vertical_8_dual vpx_lpf_vertical_8_dual_neon
void vpx_mbpost_proc_across_ip_c(unsigned char *dst, int pitch, int rows, int cols,int flimit);
-#define vpx_mbpost_proc_across_ip vpx_mbpost_proc_across_ip_c
+void vpx_mbpost_proc_across_ip_neon(unsigned char *dst, int pitch, int rows, int cols,int flimit);
+#define vpx_mbpost_proc_across_ip vpx_mbpost_proc_across_ip_neon
void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,int flimit);
-#define vpx_mbpost_proc_down vpx_mbpost_proc_down_c
+void vpx_mbpost_proc_down_neon(unsigned char *dst, int pitch, int rows, int cols,int flimit);
+#define vpx_mbpost_proc_down vpx_mbpost_proc_down_neon
void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
void vpx_minmax_8x8_neon(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
@@ -449,7 +455,8 @@ void vpx_plane_add_noise_c(uint8_t *start, const int8_t *noise, int blackclamp,
#define vpx_plane_add_noise vpx_plane_add_noise_c
void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size);
-#define vpx_post_proc_down_and_across_mb_row vpx_post_proc_down_and_across_mb_row_c
+void vpx_post_proc_down_and_across_mb_row_neon(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size);
+#define vpx_post_proc_down_and_across_mb_row vpx_post_proc_down_and_across_mb_row_neon
void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
#define vpx_quantize_b vpx_quantize_b_c
diff --git a/chromium/third_party/libvpx/source/config/linux/generic/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/linux/generic/vp9_rtcd.h
index f0824a37a80..8251c1b5a19 100644
--- a/chromium/third_party/libvpx/source/config/linux/generic/vp9_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/linux/generic/vp9_rtcd.h
@@ -83,10 +83,10 @@ void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd);
#define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c
-void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd);
+void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd);
#define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c
-void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd);
+void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd);
#define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c
void vp9_highbd_mbpost_proc_across_ip_c(uint16_t *src, int pitch, int rows, int cols, int flimit);
@@ -110,10 +110,10 @@ void vp9_highbd_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, ui
void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
#define vp9_iht16x16_256_add vp9_iht16x16_256_add_c
-void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
+void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
#define vp9_iht4x4_16_add vp9_iht4x4_16_add_c
-void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
+void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
#define vp9_iht8x8_64_add vp9_iht8x8_64_add_c
void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
diff --git a/chromium/third_party/libvpx/source/config/linux/generic/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/linux/generic/vpx_dsp_rtcd.h
index 163cf7611a8..f537568dd91 100644
--- a/chromium/third_party/libvpx/source/config/linux/generic/vpx_dsp_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/linux/generic/vpx_dsp_rtcd.h
@@ -901,43 +901,43 @@ void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint1
void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
#define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_c
-void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_idct16x16_10_add vpx_highbd_idct16x16_10_add_c
-void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_c
-void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_idct16x16_256_add vpx_highbd_idct16x16_256_add_c
-void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_c
-void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_idct32x32_1_add vpx_highbd_idct32x32_1_add_c
-void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_c
-void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_idct4x4_16_add vpx_highbd_idct4x4_16_add_c
-void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_c
-void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_idct8x8_12_add vpx_highbd_idct8x8_12_add_c
-void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_c
-void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_idct8x8_64_add vpx_highbd_idct8x8_64_add_c
-void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c
-void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c
void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
@@ -1177,40 +1177,40 @@ void vpx_highbd_v_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint1
void vpx_highbd_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
#define vpx_highbd_v_predictor_8x8 vpx_highbd_v_predictor_8x8_c
-void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct16x16_10_add vpx_idct16x16_10_add_c
-void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct16x16_1_add vpx_idct16x16_1_add_c
-void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct16x16_256_add vpx_idct16x16_256_add_c
-void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_c
-void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct32x32_135_add vpx_idct32x32_135_add_c
-void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct32x32_1_add vpx_idct32x32_1_add_c
-void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct32x32_34_add vpx_idct32x32_34_add_c
-void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct4x4_16_add vpx_idct4x4_16_add_c
-void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct4x4_1_add vpx_idct4x4_1_add_c
-void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct8x8_12_add vpx_idct8x8_12_add_c
-void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct8x8_1_add vpx_idct8x8_1_add_c
-void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct8x8_64_add vpx_idct8x8_64_add_c
int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width);
@@ -1219,10 +1219,10 @@ int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width);
void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height);
#define vpx_int_pro_row vpx_int_pro_row_c
-void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_c
-void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c
void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
diff --git a/chromium/third_party/libvpx/source/config/linux/ia32/vp8_rtcd.h b/chromium/third_party/libvpx/source/config/linux/ia32/vp8_rtcd.h
index 8b235e876e3..3addf41714b 100644
--- a/chromium/third_party/libvpx/source/config/linux/ia32/vp8_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/linux/ia32/vp8_rtcd.h
@@ -297,7 +297,7 @@ static void setup_rtcd_internal(void)
vp8_mbuverror = vp8_mbuverror_c;
if (flags & HAS_SSE2) vp8_mbuverror = vp8_mbuverror_sse2;
vp8_refining_search_sad = vp8_refining_search_sad_c;
- if (flags & HAS_SSE3) vp8_refining_search_sad = vp8_refining_search_sadx4;
+ if (flags & HAS_SSE2) vp8_refining_search_sad = vp8_refining_search_sadx4;
vp8_regular_quantize_b = vp8_regular_quantize_b_c;
if (flags & HAS_SSE2) vp8_regular_quantize_b = vp8_regular_quantize_b_sse2;
if (flags & HAS_SSE4_1) vp8_regular_quantize_b = vp8_regular_quantize_b_sse4_1;
diff --git a/chromium/third_party/libvpx/source/config/linux/ia32/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/linux/ia32/vp9_rtcd.h
index 55c229554e3..28b5da86510 100644
--- a/chromium/third_party/libvpx/source/config/linux/ia32/vp9_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/linux/ia32/vp9_rtcd.h
@@ -97,10 +97,10 @@ void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd);
#define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c
-void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd);
+void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd);
#define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c
-void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd);
+void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd);
#define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c
void vp9_highbd_mbpost_proc_across_ip_c(uint16_t *src, int pitch, int rows, int cols, int flimit);
@@ -125,13 +125,13 @@ void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch,
void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
RTCD_EXTERN void (*vp9_iht16x16_256_add)(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
-void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
-void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
-RTCD_EXTERN void (*vp9_iht4x4_16_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
+void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+RTCD_EXTERN void (*vp9_iht4x4_16_add)(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
-void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
-void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
-RTCD_EXTERN void (*vp9_iht8x8_64_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
+void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+RTCD_EXTERN void (*vp9_iht8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
#define vp9_quantize_fp vp9_quantize_fp_c
diff --git a/chromium/third_party/libvpx/source/config/linux/ia32/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/linux/ia32/vpx_dsp_rtcd.h
index 5e31286207d..58079fa420e 100644
--- a/chromium/third_party/libvpx/source/config/linux/ia32/vpx_dsp_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/linux/ia32/vpx_dsp_rtcd.h
@@ -1084,49 +1084,49 @@ void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint1
void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
#define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_c
-void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
-void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
-RTCD_EXTERN void (*vpx_highbd_idct16x16_10_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+RTCD_EXTERN void (*vpx_highbd_idct16x16_10_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_c
-void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
-void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
-RTCD_EXTERN void (*vpx_highbd_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+RTCD_EXTERN void (*vpx_highbd_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_c
-void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
-void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
-RTCD_EXTERN void (*vpx_highbd_idct32x32_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+RTCD_EXTERN void (*vpx_highbd_idct32x32_1_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_c
-void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
-void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
-RTCD_EXTERN void (*vpx_highbd_idct4x4_16_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+RTCD_EXTERN void (*vpx_highbd_idct4x4_16_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_c
-void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
-void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
-RTCD_EXTERN void (*vpx_highbd_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+RTCD_EXTERN void (*vpx_highbd_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_c
-void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
-void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
-RTCD_EXTERN void (*vpx_highbd_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+RTCD_EXTERN void (*vpx_highbd_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c
-void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c
void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
@@ -1423,53 +1423,53 @@ void vpx_highbd_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint1
void vpx_highbd_v_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
RTCD_EXTERN void (*vpx_highbd_v_predictor_8x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
-void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vpx_idct16x16_10_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct16x16_10_add)(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vpx_idct16x16_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct16x16_1_add)(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vpx_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vpx_idct32x32_135_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct32x32_135_add)(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vpx_idct32x32_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct32x32_1_add)(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vpx_idct32x32_34_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct32x32_34_add)(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vpx_idct4x4_16_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct4x4_16_add)(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vpx_idct4x4_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct4x4_1_add)(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vpx_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vpx_idct8x8_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct8x8_1_add)(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vpx_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride);
int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width);
int16_t vpx_int_pro_col_sse2(const uint8_t *ref, const int width);
@@ -1479,11 +1479,11 @@ void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride,
void vpx_int_pro_row_sse2(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height);
RTCD_EXTERN void (*vpx_int_pro_row)(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height);
-void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_iwht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vpx_iwht4x4_16_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_iwht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_iwht4x4_16_add)(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c
void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
@@ -1537,11 +1537,11 @@ void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0,
RTCD_EXTERN void (*vpx_lpf_vertical_8_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
void vpx_mbpost_proc_across_ip_c(unsigned char *dst, int pitch, int rows, int cols,int flimit);
-void vpx_mbpost_proc_across_ip_xmm(unsigned char *dst, int pitch, int rows, int cols,int flimit);
+void vpx_mbpost_proc_across_ip_sse2(unsigned char *dst, int pitch, int rows, int cols,int flimit);
RTCD_EXTERN void (*vpx_mbpost_proc_across_ip)(unsigned char *dst, int pitch, int rows, int cols,int flimit);
void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,int flimit);
-void vpx_mbpost_proc_down_xmm(unsigned char *dst, int pitch, int rows, int cols,int flimit);
+void vpx_mbpost_proc_down_sse2(unsigned char *dst, int pitch, int rows, int cols,int flimit);
RTCD_EXTERN void (*vpx_mbpost_proc_down)(unsigned char *dst, int pitch, int rows, int cols,int flimit);
void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
@@ -2618,9 +2618,9 @@ static void setup_rtcd_internal(void)
vpx_lpf_vertical_8_dual = vpx_lpf_vertical_8_dual_c;
if (flags & HAS_SSE2) vpx_lpf_vertical_8_dual = vpx_lpf_vertical_8_dual_sse2;
vpx_mbpost_proc_across_ip = vpx_mbpost_proc_across_ip_c;
- if (flags & HAS_SSE2) vpx_mbpost_proc_across_ip = vpx_mbpost_proc_across_ip_xmm;
+ if (flags & HAS_SSE2) vpx_mbpost_proc_across_ip = vpx_mbpost_proc_across_ip_sse2;
vpx_mbpost_proc_down = vpx_mbpost_proc_down_c;
- if (flags & HAS_SSE2) vpx_mbpost_proc_down = vpx_mbpost_proc_down_xmm;
+ if (flags & HAS_SSE2) vpx_mbpost_proc_down = vpx_mbpost_proc_down_sse2;
vpx_minmax_8x8 = vpx_minmax_8x8_c;
if (flags & HAS_SSE2) vpx_minmax_8x8 = vpx_minmax_8x8_sse2;
vpx_mse16x16 = vpx_mse16x16_c;
diff --git a/chromium/third_party/libvpx/source/config/linux/mips64el/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/linux/mips64el/vp9_rtcd.h
index 3d80ce20e90..403db512b5d 100644
--- a/chromium/third_party/libvpx/source/config/linux/mips64el/vp9_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/linux/mips64el/vp9_rtcd.h
@@ -68,10 +68,10 @@ void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
#define vp9_iht16x16_256_add vp9_iht16x16_256_add_c
-void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
+void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
#define vp9_iht4x4_16_add vp9_iht4x4_16_add_c
-void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
+void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
#define vp9_iht8x8_64_add vp9_iht8x8_64_add_c
void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
diff --git a/chromium/third_party/libvpx/source/config/linux/mips64el/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/linux/mips64el/vpx_dsp_rtcd.h
index 3dfc85323ca..8a27f964d1d 100644
--- a/chromium/third_party/libvpx/source/config/linux/mips64el/vpx_dsp_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/linux/mips64el/vpx_dsp_rtcd.h
@@ -271,40 +271,40 @@ void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, int16_t *coeff)
void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c
-void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct16x16_10_add vpx_idct16x16_10_add_c
-void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct16x16_1_add vpx_idct16x16_1_add_c
-void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct16x16_256_add vpx_idct16x16_256_add_c
-void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_c
-void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct32x32_135_add vpx_idct32x32_135_add_c
-void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct32x32_1_add vpx_idct32x32_1_add_c
-void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct32x32_34_add vpx_idct32x32_34_add_c
-void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct4x4_16_add vpx_idct4x4_16_add_c
-void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct4x4_1_add vpx_idct4x4_1_add_c
-void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct8x8_12_add vpx_idct8x8_12_add_c
-void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct8x8_1_add vpx_idct8x8_1_add_c
-void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct8x8_64_add vpx_idct8x8_64_add_c
int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width);
@@ -313,10 +313,10 @@ int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width);
void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height);
#define vpx_int_pro_row vpx_int_pro_row_c
-void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_c
-void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c
void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
diff --git a/chromium/third_party/libvpx/source/config/linux/mipsel/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/linux/mipsel/vp9_rtcd.h
index 3d80ce20e90..403db512b5d 100644
--- a/chromium/third_party/libvpx/source/config/linux/mipsel/vp9_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/linux/mipsel/vp9_rtcd.h
@@ -68,10 +68,10 @@ void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
#define vp9_iht16x16_256_add vp9_iht16x16_256_add_c
-void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
+void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
#define vp9_iht4x4_16_add vp9_iht4x4_16_add_c
-void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
+void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
#define vp9_iht8x8_64_add vp9_iht8x8_64_add_c
void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
diff --git a/chromium/third_party/libvpx/source/config/linux/mipsel/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/linux/mipsel/vpx_dsp_rtcd.h
index 3dfc85323ca..8a27f964d1d 100644
--- a/chromium/third_party/libvpx/source/config/linux/mipsel/vpx_dsp_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/linux/mipsel/vpx_dsp_rtcd.h
@@ -271,40 +271,40 @@ void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, int16_t *coeff)
void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c
-void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct16x16_10_add vpx_idct16x16_10_add_c
-void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct16x16_1_add vpx_idct16x16_1_add_c
-void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct16x16_256_add vpx_idct16x16_256_add_c
-void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_c
-void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct32x32_135_add vpx_idct32x32_135_add_c
-void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct32x32_1_add vpx_idct32x32_1_add_c
-void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct32x32_34_add vpx_idct32x32_34_add_c
-void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct4x4_16_add vpx_idct4x4_16_add_c
-void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct4x4_1_add vpx_idct4x4_1_add_c
-void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct8x8_12_add vpx_idct8x8_12_add_c
-void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct8x8_1_add vpx_idct8x8_1_add_c
-void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct8x8_64_add vpx_idct8x8_64_add_c
int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width);
@@ -313,10 +313,10 @@ int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width);
void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height);
#define vpx_int_pro_row vpx_int_pro_row_c
-void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_c
-void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c
void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
diff --git a/chromium/third_party/libvpx/source/config/linux/x64/vp8_rtcd.h b/chromium/third_party/libvpx/source/config/linux/x64/vp8_rtcd.h
index c66d7913431..8dcc9eb2b99 100644
--- a/chromium/third_party/libvpx/source/config/linux/x64/vp8_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/linux/x64/vp8_rtcd.h
@@ -169,7 +169,7 @@ int vp8_mbuverror_sse2(struct macroblock *mb);
int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
int vp8_refining_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-RTCD_EXTERN int (*vp8_refining_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+#define vp8_refining_search_sad vp8_refining_search_sadx4
void vp8_regular_quantize_b_c(struct block *, struct blockd *);
void vp8_regular_quantize_b_sse2(struct block *, struct blockd *);
@@ -240,8 +240,6 @@ static void setup_rtcd_internal(void)
vp8_full_search_sad = vp8_full_search_sad_c;
if (flags & HAS_SSE3) vp8_full_search_sad = vp8_full_search_sadx3;
if (flags & HAS_SSE4_1) vp8_full_search_sad = vp8_full_search_sadx8;
- vp8_refining_search_sad = vp8_refining_search_sad_c;
- if (flags & HAS_SSE3) vp8_refining_search_sad = vp8_refining_search_sadx4;
vp8_regular_quantize_b = vp8_regular_quantize_b_sse2;
if (flags & HAS_SSE4_1) vp8_regular_quantize_b = vp8_regular_quantize_b_sse4_1;
vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_sse2;
diff --git a/chromium/third_party/libvpx/source/config/linux/x64/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/linux/x64/vp9_rtcd.h
index f747ed67d01..072f858e67e 100644
--- a/chromium/third_party/libvpx/source/config/linux/x64/vp9_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/linux/x64/vp9_rtcd.h
@@ -97,10 +97,10 @@ void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd);
#define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c
-void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd);
+void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd);
#define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c
-void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd);
+void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd);
#define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c
void vp9_highbd_mbpost_proc_across_ip_c(uint16_t *src, int pitch, int rows, int cols, int flimit);
@@ -125,12 +125,12 @@ void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch,
void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
#define vp9_iht16x16_256_add vp9_iht16x16_256_add_sse2
-void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
-void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
+void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
#define vp9_iht4x4_16_add vp9_iht4x4_16_add_sse2
-void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
-void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
+void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
#define vp9_iht8x8_64_add vp9_iht8x8_64_add_sse2
void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
diff --git a/chromium/third_party/libvpx/source/config/linux/x64/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/linux/x64/vpx_dsp_rtcd.h
index 1188bb43b56..bcb567d8ec6 100644
--- a/chromium/third_party/libvpx/source/config/linux/x64/vpx_dsp_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/linux/x64/vpx_dsp_rtcd.h
@@ -1091,49 +1091,49 @@ void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint1
void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
#define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_c
-void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
-void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_idct16x16_10_add vpx_highbd_idct16x16_10_add_sse2
-void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_c
-void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
-void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_idct16x16_256_add vpx_highbd_idct16x16_256_add_sse2
-void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_c
-void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
-void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_idct32x32_1_add vpx_highbd_idct32x32_1_add_sse2
-void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_c
-void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
-void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_idct4x4_16_add vpx_highbd_idct4x4_16_add_sse2
-void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_c
-void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
-void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_idct8x8_12_add vpx_highbd_idct8x8_12_add_sse2
-void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_c
-void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
-void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_idct8x8_64_add vpx_highbd_idct8x8_64_add_sse2
-void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c
-void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c
void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
@@ -1430,58 +1430,58 @@ void vpx_highbd_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint1
void vpx_highbd_v_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
#define vpx_highbd_v_predictor_8x8 vpx_highbd_v_predictor_8x8_sse2
-void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct16x16_10_add vpx_idct16x16_10_add_sse2
-void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct16x16_1_add vpx_idct16x16_1_add_sse2
-void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct16x16_256_add vpx_idct16x16_256_add_sse2
-void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct32x32_1024_add_ssse3(const tran_low_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_1024_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vpx_idct32x32_135_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct32x32_135_add)(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct32x32_1_add vpx_idct32x32_1_add_sse2
-void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct32x32_34_add_ssse3(const tran_low_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vpx_idct32x32_34_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_34_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct32x32_34_add)(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct4x4_16_add vpx_idct4x4_16_add_sse2
-void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct4x4_1_add vpx_idct4x4_1_add_sse2
-void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vpx_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct8x8_1_add vpx_idct8x8_1_add_sse2
-void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct8x8_64_add_ssse3(const tran_low_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vpx_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct8x8_64_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride);
int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width);
int16_t vpx_int_pro_col_sse2(const uint8_t *ref, const int width);
@@ -1491,11 +1491,11 @@ void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride,
void vpx_int_pro_row_sse2(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height);
#define vpx_int_pro_row vpx_int_pro_row_sse2
-void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_iwht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_iwht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_sse2
-void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c
void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
@@ -1549,12 +1549,12 @@ void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0,
#define vpx_lpf_vertical_8_dual vpx_lpf_vertical_8_dual_sse2
void vpx_mbpost_proc_across_ip_c(unsigned char *dst, int pitch, int rows, int cols,int flimit);
-void vpx_mbpost_proc_across_ip_xmm(unsigned char *dst, int pitch, int rows, int cols,int flimit);
-#define vpx_mbpost_proc_across_ip vpx_mbpost_proc_across_ip_xmm
+void vpx_mbpost_proc_across_ip_sse2(unsigned char *dst, int pitch, int rows, int cols,int flimit);
+#define vpx_mbpost_proc_across_ip vpx_mbpost_proc_across_ip_sse2
void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,int flimit);
-void vpx_mbpost_proc_down_xmm(unsigned char *dst, int pitch, int rows, int cols,int flimit);
-#define vpx_mbpost_proc_down vpx_mbpost_proc_down_xmm
+void vpx_mbpost_proc_down_sse2(unsigned char *dst, int pitch, int rows, int cols,int flimit);
+#define vpx_mbpost_proc_down vpx_mbpost_proc_down_sse2
void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
diff --git a/chromium/third_party/libvpx/source/config/mac/ia32/vp8_rtcd.h b/chromium/third_party/libvpx/source/config/mac/ia32/vp8_rtcd.h
index 8b235e876e3..3addf41714b 100644
--- a/chromium/third_party/libvpx/source/config/mac/ia32/vp8_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/mac/ia32/vp8_rtcd.h
@@ -297,7 +297,7 @@ static void setup_rtcd_internal(void)
vp8_mbuverror = vp8_mbuverror_c;
if (flags & HAS_SSE2) vp8_mbuverror = vp8_mbuverror_sse2;
vp8_refining_search_sad = vp8_refining_search_sad_c;
- if (flags & HAS_SSE3) vp8_refining_search_sad = vp8_refining_search_sadx4;
+ if (flags & HAS_SSE2) vp8_refining_search_sad = vp8_refining_search_sadx4;
vp8_regular_quantize_b = vp8_regular_quantize_b_c;
if (flags & HAS_SSE2) vp8_regular_quantize_b = vp8_regular_quantize_b_sse2;
if (flags & HAS_SSE4_1) vp8_regular_quantize_b = vp8_regular_quantize_b_sse4_1;
diff --git a/chromium/third_party/libvpx/source/config/mac/ia32/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/mac/ia32/vp9_rtcd.h
index 55c229554e3..28b5da86510 100644
--- a/chromium/third_party/libvpx/source/config/mac/ia32/vp9_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/mac/ia32/vp9_rtcd.h
@@ -97,10 +97,10 @@ void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd);
#define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c
-void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd);
+void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd);
#define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c
-void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd);
+void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd);
#define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c
void vp9_highbd_mbpost_proc_across_ip_c(uint16_t *src, int pitch, int rows, int cols, int flimit);
@@ -125,13 +125,13 @@ void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch,
void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
RTCD_EXTERN void (*vp9_iht16x16_256_add)(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
-void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
-void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
-RTCD_EXTERN void (*vp9_iht4x4_16_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
+void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+RTCD_EXTERN void (*vp9_iht4x4_16_add)(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
-void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
-void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
-RTCD_EXTERN void (*vp9_iht8x8_64_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
+void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+RTCD_EXTERN void (*vp9_iht8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
#define vp9_quantize_fp vp9_quantize_fp_c
diff --git a/chromium/third_party/libvpx/source/config/mac/ia32/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/mac/ia32/vpx_dsp_rtcd.h
index 5e31286207d..58079fa420e 100644
--- a/chromium/third_party/libvpx/source/config/mac/ia32/vpx_dsp_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/mac/ia32/vpx_dsp_rtcd.h
@@ -1084,49 +1084,49 @@ void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint1
void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
#define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_c
-void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
-void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
-RTCD_EXTERN void (*vpx_highbd_idct16x16_10_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+RTCD_EXTERN void (*vpx_highbd_idct16x16_10_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_c
-void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
-void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
-RTCD_EXTERN void (*vpx_highbd_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+RTCD_EXTERN void (*vpx_highbd_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_c
-void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
-void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
-RTCD_EXTERN void (*vpx_highbd_idct32x32_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+RTCD_EXTERN void (*vpx_highbd_idct32x32_1_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_c
-void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
-void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
-RTCD_EXTERN void (*vpx_highbd_idct4x4_16_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+RTCD_EXTERN void (*vpx_highbd_idct4x4_16_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_c
-void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
-void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
-RTCD_EXTERN void (*vpx_highbd_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+RTCD_EXTERN void (*vpx_highbd_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_c
-void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
-void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
-RTCD_EXTERN void (*vpx_highbd_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+RTCD_EXTERN void (*vpx_highbd_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c
-void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c
void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
@@ -1423,53 +1423,53 @@ void vpx_highbd_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint1
void vpx_highbd_v_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
RTCD_EXTERN void (*vpx_highbd_v_predictor_8x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
-void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vpx_idct16x16_10_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct16x16_10_add)(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vpx_idct16x16_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct16x16_1_add)(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vpx_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vpx_idct32x32_135_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct32x32_135_add)(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vpx_idct32x32_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct32x32_1_add)(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vpx_idct32x32_34_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct32x32_34_add)(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vpx_idct4x4_16_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct4x4_16_add)(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vpx_idct4x4_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct4x4_1_add)(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vpx_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vpx_idct8x8_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct8x8_1_add)(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vpx_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride);
int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width);
int16_t vpx_int_pro_col_sse2(const uint8_t *ref, const int width);
@@ -1479,11 +1479,11 @@ void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride,
void vpx_int_pro_row_sse2(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height);
RTCD_EXTERN void (*vpx_int_pro_row)(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height);
-void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_iwht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vpx_iwht4x4_16_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_iwht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_iwht4x4_16_add)(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c
void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
@@ -1537,11 +1537,11 @@ void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0,
RTCD_EXTERN void (*vpx_lpf_vertical_8_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
void vpx_mbpost_proc_across_ip_c(unsigned char *dst, int pitch, int rows, int cols,int flimit);
-void vpx_mbpost_proc_across_ip_xmm(unsigned char *dst, int pitch, int rows, int cols,int flimit);
+void vpx_mbpost_proc_across_ip_sse2(unsigned char *dst, int pitch, int rows, int cols,int flimit);
RTCD_EXTERN void (*vpx_mbpost_proc_across_ip)(unsigned char *dst, int pitch, int rows, int cols,int flimit);
void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,int flimit);
-void vpx_mbpost_proc_down_xmm(unsigned char *dst, int pitch, int rows, int cols,int flimit);
+void vpx_mbpost_proc_down_sse2(unsigned char *dst, int pitch, int rows, int cols,int flimit);
RTCD_EXTERN void (*vpx_mbpost_proc_down)(unsigned char *dst, int pitch, int rows, int cols,int flimit);
void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
@@ -2618,9 +2618,9 @@ static void setup_rtcd_internal(void)
vpx_lpf_vertical_8_dual = vpx_lpf_vertical_8_dual_c;
if (flags & HAS_SSE2) vpx_lpf_vertical_8_dual = vpx_lpf_vertical_8_dual_sse2;
vpx_mbpost_proc_across_ip = vpx_mbpost_proc_across_ip_c;
- if (flags & HAS_SSE2) vpx_mbpost_proc_across_ip = vpx_mbpost_proc_across_ip_xmm;
+ if (flags & HAS_SSE2) vpx_mbpost_proc_across_ip = vpx_mbpost_proc_across_ip_sse2;
vpx_mbpost_proc_down = vpx_mbpost_proc_down_c;
- if (flags & HAS_SSE2) vpx_mbpost_proc_down = vpx_mbpost_proc_down_xmm;
+ if (flags & HAS_SSE2) vpx_mbpost_proc_down = vpx_mbpost_proc_down_sse2;
vpx_minmax_8x8 = vpx_minmax_8x8_c;
if (flags & HAS_SSE2) vpx_minmax_8x8 = vpx_minmax_8x8_sse2;
vpx_mse16x16 = vpx_mse16x16_c;
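
The hunk above also shows the run-time dispatch pattern these generated headers rely on: setup_rtcd_internal() starts every entry point at its portable _c body and only rebinds it when the CPU flag test passes, so renaming the _xmm bodies to _sse2 changes symbol names but not behaviour. A minimal compilable sketch of that pattern follows; my_op_c, my_op_sse2, cpu_flags and the flag value are hypothetical stand-ins, not libvpx symbols.

    /* Sketch of the RTCD dispatch idiom used by these generated headers.
     * my_op_c / my_op_sse2 are hypothetical kernels; HAS_SSE2 here is a
     * stand-in bit, the real value comes from vpx_ports. */
    #include <stdint.h>
    #include <stdio.h>

    #define HAS_SSE2 0x01

    static void my_op_c(uint8_t *dst, int stride)    { (void)stride; dst[0] = 0; }
    static void my_op_sse2(uint8_t *dst, int stride) { (void)stride; dst[0] = 1; }

    static void (*my_op)(uint8_t *dst, int stride);

    static int cpu_flags(void) { return HAS_SSE2; }   /* stand-in for the real CPU probe */

    static void setup_rtcd_internal(void)
    {
        int flags = cpu_flags();
        my_op = my_op_c;                               /* portable default */
        if (flags & HAS_SSE2) my_op = my_op_sse2;      /* upgrade when SSE2 is present */
    }

    int main(void)
    {
        uint8_t pixel = 0;
        setup_rtcd_internal();
        my_op(&pixel, 1);
        printf("dispatched to %s\n", pixel ? "SSE2 body" : "C body");
        return 0;
    }
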
diff --git a/chromium/third_party/libvpx/source/config/mac/x64/vp8_rtcd.h b/chromium/third_party/libvpx/source/config/mac/x64/vp8_rtcd.h
index c66d7913431..8dcc9eb2b99 100644
--- a/chromium/third_party/libvpx/source/config/mac/x64/vp8_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/mac/x64/vp8_rtcd.h
@@ -169,7 +169,7 @@ int vp8_mbuverror_sse2(struct macroblock *mb);
int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
int vp8_refining_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-RTCD_EXTERN int (*vp8_refining_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+#define vp8_refining_search_sad vp8_refining_search_sadx4
void vp8_regular_quantize_b_c(struct block *, struct blockd *);
void vp8_regular_quantize_b_sse2(struct block *, struct blockd *);
@@ -240,8 +240,6 @@ static void setup_rtcd_internal(void)
vp8_full_search_sad = vp8_full_search_sad_c;
if (flags & HAS_SSE3) vp8_full_search_sad = vp8_full_search_sadx3;
if (flags & HAS_SSE4_1) vp8_full_search_sad = vp8_full_search_sadx8;
- vp8_refining_search_sad = vp8_refining_search_sad_c;
- if (flags & HAS_SSE3) vp8_refining_search_sad = vp8_refining_search_sadx4;
vp8_regular_quantize_b = vp8_regular_quantize_b_sse2;
if (flags & HAS_SSE4_1) vp8_regular_quantize_b = vp8_regular_quantize_b_sse4_1;
vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_sse2;
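
In this x64 header the same mechanism is removed for vp8_refining_search_sad: the RTCD_EXTERN pointer and its setup_rtcd_internal() lines are dropped in favour of a direct #define to vp8_refining_search_sadx4, which reads as the specialised body now only requiring SSE2, an ISA level that every x86-64 CPU guarantees. A compilable illustration of compile-time binding versus run-time dispatch, with hypothetical function names:

    /* Illustrative only: why a 64-bit config can #define the dispatch name
     * directly. refining_search_c / refining_search_simd are stand-ins. */
    #include <stdio.h>

    static int refining_search_c(int x)    { return x + 1; }
    static int refining_search_simd(int x) { return x + 1; }   /* same result, faster path */

    #if defined(__x86_64__) || defined(_M_X64)
    /* SSE2 is part of the x86-64 baseline, so no run-time check is needed. */
    #  define refining_search refining_search_simd
    #else
    /* 32-bit builds keep a pointer that setup_rtcd_internal() fills in. */
    static int (*refining_search)(int) = refining_search_c;
    #endif

    int main(void) { printf("%d\n", refining_search(1)); return 0; }
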
diff --git a/chromium/third_party/libvpx/source/config/mac/x64/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/mac/x64/vp9_rtcd.h
index f747ed67d01..072f858e67e 100644
--- a/chromium/third_party/libvpx/source/config/mac/x64/vp9_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/mac/x64/vp9_rtcd.h
@@ -97,10 +97,10 @@ void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd);
#define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c
-void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd);
+void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd);
#define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c
-void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd);
+void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd);
#define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c
void vp9_highbd_mbpost_proc_across_ip_c(uint16_t *src, int pitch, int rows, int cols, int flimit);
@@ -125,12 +125,12 @@ void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch,
void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
#define vp9_iht16x16_256_add vp9_iht16x16_256_add_sse2
-void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
-void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
+void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
#define vp9_iht4x4_16_add vp9_iht4x4_16_add_sse2
-void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
-void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
+void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
#define vp9_iht8x8_64_add vp9_iht8x8_64_add_sse2
void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
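
The tx_type argument in the vp9_iht* prototypes above selects which 1-D transforms (DCT or ADST) are combined in the two directions of the hybrid inverse transform; only the stride parameter name changes in this diff. The enum names below follow VP9's usual TX_TYPE order, but the helper itself is a hypothetical illustration, not a libvpx function.

    /* Hypothetical helper describing what each tx_type value combines. */
    #include <stdio.h>

    typedef enum { DCT_DCT = 0, ADST_DCT = 1, DCT_ADST = 2, ADST_ADST = 3 } tx_type_t;

    static const char *describe_tx_type(tx_type_t t)
    {
        switch (t) {
        case DCT_DCT:   return "DCT in both directions";
        case ADST_DCT:  return "ADST in one direction, DCT in the other";
        case DCT_ADST:  return "DCT in one direction, ADST in the other";
        case ADST_ADST: return "ADST in both directions";
        }
        return "unknown";
    }

    int main(void) { printf("%s\n", describe_tx_type(ADST_DCT)); return 0; }
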
diff --git a/chromium/third_party/libvpx/source/config/mac/x64/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/mac/x64/vpx_dsp_rtcd.h
index 1188bb43b56..bcb567d8ec6 100644
--- a/chromium/third_party/libvpx/source/config/mac/x64/vpx_dsp_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/mac/x64/vpx_dsp_rtcd.h
@@ -1091,49 +1091,49 @@ void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint1
void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
#define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_c
-void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
-void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_idct16x16_10_add vpx_highbd_idct16x16_10_add_sse2
-void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_c
-void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
-void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_idct16x16_256_add vpx_highbd_idct16x16_256_add_sse2
-void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_c
-void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
-void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_idct32x32_1_add vpx_highbd_idct32x32_1_add_sse2
-void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_c
-void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
-void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_idct4x4_16_add vpx_highbd_idct4x4_16_add_sse2
-void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_c
-void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
-void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_idct8x8_12_add vpx_highbd_idct8x8_12_add_sse2
-void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_c
-void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
-void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_idct8x8_64_add vpx_highbd_idct8x8_64_add_sse2
-void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c
-void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c
void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
@@ -1430,58 +1430,58 @@ void vpx_highbd_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint1
void vpx_highbd_v_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
#define vpx_highbd_v_predictor_8x8 vpx_highbd_v_predictor_8x8_sse2
-void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct16x16_10_add vpx_idct16x16_10_add_sse2
-void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct16x16_1_add vpx_idct16x16_1_add_sse2
-void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct16x16_256_add vpx_idct16x16_256_add_sse2
-void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct32x32_1024_add_ssse3(const tran_low_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_1024_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vpx_idct32x32_135_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct32x32_135_add)(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct32x32_1_add vpx_idct32x32_1_add_sse2
-void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct32x32_34_add_ssse3(const tran_low_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vpx_idct32x32_34_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_34_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct32x32_34_add)(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct4x4_16_add vpx_idct4x4_16_add_sse2
-void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct4x4_1_add vpx_idct4x4_1_add_sse2
-void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vpx_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct8x8_1_add vpx_idct8x8_1_add_sse2
-void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct8x8_64_add_ssse3(const tran_low_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vpx_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct8x8_64_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride);
int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width);
int16_t vpx_int_pro_col_sse2(const uint8_t *ref, const int width);
@@ -1491,11 +1491,11 @@ void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride,
void vpx_int_pro_row_sse2(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height);
#define vpx_int_pro_row vpx_int_pro_row_sse2
-void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_iwht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_iwht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_sse2
-void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c
void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
@@ -1549,12 +1549,12 @@ void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0,
#define vpx_lpf_vertical_8_dual vpx_lpf_vertical_8_dual_sse2
void vpx_mbpost_proc_across_ip_c(unsigned char *dst, int pitch, int rows, int cols,int flimit);
-void vpx_mbpost_proc_across_ip_xmm(unsigned char *dst, int pitch, int rows, int cols,int flimit);
-#define vpx_mbpost_proc_across_ip vpx_mbpost_proc_across_ip_xmm
+void vpx_mbpost_proc_across_ip_sse2(unsigned char *dst, int pitch, int rows, int cols,int flimit);
+#define vpx_mbpost_proc_across_ip vpx_mbpost_proc_across_ip_sse2
void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,int flimit);
-void vpx_mbpost_proc_down_xmm(unsigned char *dst, int pitch, int rows, int cols,int flimit);
-#define vpx_mbpost_proc_down vpx_mbpost_proc_down_xmm
+void vpx_mbpost_proc_down_sse2(unsigned char *dst, int pitch, int rows, int cols,int flimit);
+#define vpx_mbpost_proc_down vpx_mbpost_proc_down_sse2
void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
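
Throughout this header the only change to the inverse-transform prototypes is the rename of the final parameter from dest_stride to stride; the meaning is unchanged: each function adds the reconstructed residual into dest, stepping stride bytes between rows of the destination picture. A self-contained sketch of that add-and-clamp step (add_residual_4x4 and clip_u8 are hypothetical helpers, not part of libvpx):

    /* Illustrative only: what the (dest, stride) pair means in these prototypes. */
    #include <stdint.h>

    static uint8_t clip_u8(int v) { return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v); }

    /* Adds a 4x4 reconstructed residual into the destination picture.
     * 'stride' is the distance, in bytes, between vertically adjacent
     * pixels of dest. */
    static void add_residual_4x4(const int16_t residual[16], uint8_t *dest, int stride)
    {
        for (int r = 0; r < 4; ++r)
            for (int c = 0; c < 4; ++c)
                dest[r * stride + c] = clip_u8(dest[r * stride + c] + residual[r * 4 + c]);
    }
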
diff --git a/chromium/third_party/libvpx/source/config/nacl/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/nacl/vp9_rtcd.h
index f0824a37a80..8251c1b5a19 100644
--- a/chromium/third_party/libvpx/source/config/nacl/vp9_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/nacl/vp9_rtcd.h
@@ -83,10 +83,10 @@ void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd);
#define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c
-void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd);
+void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd);
#define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c
-void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd);
+void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd);
#define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c
void vp9_highbd_mbpost_proc_across_ip_c(uint16_t *src, int pitch, int rows, int cols, int flimit);
@@ -110,10 +110,10 @@ void vp9_highbd_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, ui
void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
#define vp9_iht16x16_256_add vp9_iht16x16_256_add_c
-void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
+void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
#define vp9_iht4x4_16_add vp9_iht4x4_16_add_c
-void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
+void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
#define vp9_iht8x8_64_add vp9_iht8x8_64_add_c
void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
diff --git a/chromium/third_party/libvpx/source/config/nacl/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/nacl/vpx_dsp_rtcd.h
index 163cf7611a8..f537568dd91 100644
--- a/chromium/third_party/libvpx/source/config/nacl/vpx_dsp_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/nacl/vpx_dsp_rtcd.h
@@ -901,43 +901,43 @@ void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint1
void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
#define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_c
-void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_idct16x16_10_add vpx_highbd_idct16x16_10_add_c
-void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_c
-void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_idct16x16_256_add vpx_highbd_idct16x16_256_add_c
-void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_c
-void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_idct32x32_1_add vpx_highbd_idct32x32_1_add_c
-void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_c
-void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_idct4x4_16_add vpx_highbd_idct4x4_16_add_c
-void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_c
-void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_idct8x8_12_add vpx_highbd_idct8x8_12_add_c
-void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_c
-void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_idct8x8_64_add vpx_highbd_idct8x8_64_add_c
-void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c
-void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c
void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
@@ -1177,40 +1177,40 @@ void vpx_highbd_v_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint1
void vpx_highbd_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
#define vpx_highbd_v_predictor_8x8 vpx_highbd_v_predictor_8x8_c
-void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct16x16_10_add vpx_idct16x16_10_add_c
-void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct16x16_1_add vpx_idct16x16_1_add_c
-void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct16x16_256_add vpx_idct16x16_256_add_c
-void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_c
-void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct32x32_135_add vpx_idct32x32_135_add_c
-void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct32x32_1_add vpx_idct32x32_1_add_c
-void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct32x32_34_add vpx_idct32x32_34_add_c
-void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct4x4_16_add vpx_idct4x4_16_add_c
-void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct4x4_1_add vpx_idct4x4_1_add_c
-void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct8x8_12_add vpx_idct8x8_12_add_c
-void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct8x8_1_add vpx_idct8x8_1_add_c
-void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct8x8_64_add vpx_idct8x8_64_add_c
int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width);
@@ -1219,10 +1219,10 @@ int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width);
void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height);
#define vpx_int_pro_row vpx_int_pro_row_c
-void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_c
-void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c
void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
diff --git a/chromium/third_party/libvpx/source/config/vpx_version.h b/chromium/third_party/libvpx/source/config/vpx_version.h
index 97666fffaf6..07f046ed1bc 100644
--- a/chromium/third_party/libvpx/source/config/vpx_version.h
+++ b/chromium/third_party/libvpx/source/config/vpx_version.h
@@ -1,7 +1,7 @@
#define VERSION_MAJOR 1
#define VERSION_MINOR 6
#define VERSION_PATCH 0
-#define VERSION_EXTRA "702-g5c64c01"
+#define VERSION_EXTRA "903-g5b1a8ca5e"
#define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.6.0-702-g5c64c01"
-#define VERSION_STRING " v1.6.0-702-g5c64c01"
+#define VERSION_STRING_NOSP "v1.6.0-903-g5b1a8ca5e"
+#define VERSION_STRING " v1.6.0-903-g5b1a8ca5e"
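
Only VERSION_EXTRA and the version strings change here; VERSION_PACKED still folds major/minor/patch into one integer, so 1.6.0 encodes as 0x010600 (67072). A quick check of that encoding:

    /* Quick check of how VERSION_PACKED encodes major/minor/patch for 1.6.0. */
    #include <stdio.h>

    #define VERSION_MAJOR 1
    #define VERSION_MINOR 6
    #define VERSION_PATCH 0
    #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))

    int main(void)
    {
        /* Prints 0x010600 (67072): 1*65536 + 6*256 + 0. */
        printf("0x%06x (%d)\n", VERSION_PACKED, VERSION_PACKED);
        return 0;
    }
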
diff --git a/chromium/third_party/libvpx/source/config/win/ia32/vp8_rtcd.h b/chromium/third_party/libvpx/source/config/win/ia32/vp8_rtcd.h
index 8b235e876e3..3addf41714b 100644
--- a/chromium/third_party/libvpx/source/config/win/ia32/vp8_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/win/ia32/vp8_rtcd.h
@@ -297,7 +297,7 @@ static void setup_rtcd_internal(void)
vp8_mbuverror = vp8_mbuverror_c;
if (flags & HAS_SSE2) vp8_mbuverror = vp8_mbuverror_sse2;
vp8_refining_search_sad = vp8_refining_search_sad_c;
- if (flags & HAS_SSE3) vp8_refining_search_sad = vp8_refining_search_sadx4;
+ if (flags & HAS_SSE2) vp8_refining_search_sad = vp8_refining_search_sadx4;
vp8_regular_quantize_b = vp8_regular_quantize_b_c;
if (flags & HAS_SSE2) vp8_regular_quantize_b = vp8_regular_quantize_b_sse2;
if (flags & HAS_SSE4_1) vp8_regular_quantize_b = vp8_regular_quantize_b_sse4_1;
diff --git a/chromium/third_party/libvpx/source/config/win/ia32/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/win/ia32/vp9_rtcd.h
index 55c229554e3..28b5da86510 100644
--- a/chromium/third_party/libvpx/source/config/win/ia32/vp9_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/win/ia32/vp9_rtcd.h
@@ -97,10 +97,10 @@ void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd);
#define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c
-void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd);
+void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd);
#define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c
-void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd);
+void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd);
#define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c
void vp9_highbd_mbpost_proc_across_ip_c(uint16_t *src, int pitch, int rows, int cols, int flimit);
@@ -125,13 +125,13 @@ void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch,
void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
RTCD_EXTERN void (*vp9_iht16x16_256_add)(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
-void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
-void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
-RTCD_EXTERN void (*vp9_iht4x4_16_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
+void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+RTCD_EXTERN void (*vp9_iht4x4_16_add)(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
-void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
-void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
-RTCD_EXTERN void (*vp9_iht8x8_64_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
+void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+RTCD_EXTERN void (*vp9_iht8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
#define vp9_quantize_fp vp9_quantize_fp_c
diff --git a/chromium/third_party/libvpx/source/config/win/ia32/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/win/ia32/vpx_dsp_rtcd.h
index 5e31286207d..58079fa420e 100644
--- a/chromium/third_party/libvpx/source/config/win/ia32/vpx_dsp_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/win/ia32/vpx_dsp_rtcd.h
@@ -1084,49 +1084,49 @@ void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint1
void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
#define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_c
-void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
-void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
-RTCD_EXTERN void (*vpx_highbd_idct16x16_10_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+RTCD_EXTERN void (*vpx_highbd_idct16x16_10_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_c
-void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
-void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
-RTCD_EXTERN void (*vpx_highbd_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+RTCD_EXTERN void (*vpx_highbd_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_c
-void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
-void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
-RTCD_EXTERN void (*vpx_highbd_idct32x32_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+RTCD_EXTERN void (*vpx_highbd_idct32x32_1_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_c
-void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
-void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
-RTCD_EXTERN void (*vpx_highbd_idct4x4_16_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+RTCD_EXTERN void (*vpx_highbd_idct4x4_16_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_c
-void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
-void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
-RTCD_EXTERN void (*vpx_highbd_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+RTCD_EXTERN void (*vpx_highbd_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_c
-void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
-void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
-RTCD_EXTERN void (*vpx_highbd_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+RTCD_EXTERN void (*vpx_highbd_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd);
-void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c
-void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c
void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
@@ -1423,53 +1423,53 @@ void vpx_highbd_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint1
void vpx_highbd_v_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
RTCD_EXTERN void (*vpx_highbd_v_predictor_8x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
-void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vpx_idct16x16_10_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct16x16_10_add)(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vpx_idct16x16_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct16x16_1_add)(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vpx_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vpx_idct32x32_135_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct32x32_135_add)(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vpx_idct32x32_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct32x32_1_add)(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vpx_idct32x32_34_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct32x32_34_add)(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vpx_idct4x4_16_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct4x4_16_add)(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vpx_idct4x4_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct4x4_1_add)(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vpx_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vpx_idct8x8_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct8x8_1_add)(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vpx_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride);
int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width);
int16_t vpx_int_pro_col_sse2(const uint8_t *ref, const int width);
@@ -1479,11 +1479,11 @@ void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride,
void vpx_int_pro_row_sse2(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height);
RTCD_EXTERN void (*vpx_int_pro_row)(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height);
-void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_iwht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vpx_iwht4x4_16_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_iwht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_iwht4x4_16_add)(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c
void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
@@ -1537,11 +1537,11 @@ void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0,
RTCD_EXTERN void (*vpx_lpf_vertical_8_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
void vpx_mbpost_proc_across_ip_c(unsigned char *dst, int pitch, int rows, int cols,int flimit);
-void vpx_mbpost_proc_across_ip_xmm(unsigned char *dst, int pitch, int rows, int cols,int flimit);
+void vpx_mbpost_proc_across_ip_sse2(unsigned char *dst, int pitch, int rows, int cols,int flimit);
RTCD_EXTERN void (*vpx_mbpost_proc_across_ip)(unsigned char *dst, int pitch, int rows, int cols,int flimit);
void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,int flimit);
-void vpx_mbpost_proc_down_xmm(unsigned char *dst, int pitch, int rows, int cols,int flimit);
+void vpx_mbpost_proc_down_sse2(unsigned char *dst, int pitch, int rows, int cols,int flimit);
RTCD_EXTERN void (*vpx_mbpost_proc_down)(unsigned char *dst, int pitch, int rows, int cols,int flimit);
void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
@@ -2618,9 +2618,9 @@ static void setup_rtcd_internal(void)
vpx_lpf_vertical_8_dual = vpx_lpf_vertical_8_dual_c;
if (flags & HAS_SSE2) vpx_lpf_vertical_8_dual = vpx_lpf_vertical_8_dual_sse2;
vpx_mbpost_proc_across_ip = vpx_mbpost_proc_across_ip_c;
- if (flags & HAS_SSE2) vpx_mbpost_proc_across_ip = vpx_mbpost_proc_across_ip_xmm;
+ if (flags & HAS_SSE2) vpx_mbpost_proc_across_ip = vpx_mbpost_proc_across_ip_sse2;
vpx_mbpost_proc_down = vpx_mbpost_proc_down_c;
- if (flags & HAS_SSE2) vpx_mbpost_proc_down = vpx_mbpost_proc_down_xmm;
+ if (flags & HAS_SSE2) vpx_mbpost_proc_down = vpx_mbpost_proc_down_sse2;
vpx_minmax_8x8 = vpx_minmax_8x8_c;
if (flags & HAS_SSE2) vpx_minmax_8x8 = vpx_minmax_8x8_sse2;
vpx_mse16x16 = vpx_mse16x16_c;
diff --git a/chromium/third_party/libvpx/source/config/win/x64/vp8_rtcd.h b/chromium/third_party/libvpx/source/config/win/x64/vp8_rtcd.h
index c66d7913431..8dcc9eb2b99 100644
--- a/chromium/third_party/libvpx/source/config/win/x64/vp8_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/win/x64/vp8_rtcd.h
@@ -169,7 +169,7 @@ int vp8_mbuverror_sse2(struct macroblock *mb);
int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
int vp8_refining_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-RTCD_EXTERN int (*vp8_refining_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+#define vp8_refining_search_sad vp8_refining_search_sadx4
void vp8_regular_quantize_b_c(struct block *, struct blockd *);
void vp8_regular_quantize_b_sse2(struct block *, struct blockd *);
@@ -240,8 +240,6 @@ static void setup_rtcd_internal(void)
vp8_full_search_sad = vp8_full_search_sad_c;
if (flags & HAS_SSE3) vp8_full_search_sad = vp8_full_search_sadx3;
if (flags & HAS_SSE4_1) vp8_full_search_sad = vp8_full_search_sadx8;
- vp8_refining_search_sad = vp8_refining_search_sad_c;
- if (flags & HAS_SSE3) vp8_refining_search_sad = vp8_refining_search_sadx4;
vp8_regular_quantize_b = vp8_regular_quantize_b_sse2;
if (flags & HAS_SSE4_1) vp8_regular_quantize_b = vp8_regular_quantize_b_sse4_1;
vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_sse2;
diff --git a/chromium/third_party/libvpx/source/config/win/x64/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/win/x64/vp9_rtcd.h
index f747ed67d01..072f858e67e 100644
--- a/chromium/third_party/libvpx/source/config/win/x64/vp9_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/win/x64/vp9_rtcd.h
@@ -97,10 +97,10 @@ void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd);
#define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c
-void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd);
+void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd);
#define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c
-void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd);
+void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd);
#define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c
void vp9_highbd_mbpost_proc_across_ip_c(uint16_t *src, int pitch, int rows, int cols, int flimit);
@@ -125,12 +125,12 @@ void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch,
void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
#define vp9_iht16x16_256_add vp9_iht16x16_256_add_sse2
-void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
-void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
+void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
#define vp9_iht4x4_16_add vp9_iht4x4_16_add_sse2
-void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
-void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
+void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
#define vp9_iht8x8_64_add vp9_iht8x8_64_add_sse2
void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
diff --git a/chromium/third_party/libvpx/source/config/win/x64/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/win/x64/vpx_dsp_rtcd.h
index 1188bb43b56..bcb567d8ec6 100644
--- a/chromium/third_party/libvpx/source/config/win/x64/vpx_dsp_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/win/x64/vpx_dsp_rtcd.h
@@ -1091,49 +1091,49 @@ void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint1
void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
#define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_c
-void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
-void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_idct16x16_10_add vpx_highbd_idct16x16_10_add_sse2
-void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_c
-void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
-void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_idct16x16_256_add vpx_highbd_idct16x16_256_add_sse2
-void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_c
-void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
-void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_idct32x32_1_add vpx_highbd_idct32x32_1_add_sse2
-void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_c
-void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
-void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_idct4x4_16_add vpx_highbd_idct4x4_16_add_sse2
-void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_c
-void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
-void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_idct8x8_12_add vpx_highbd_idct8x8_12_add_sse2
-void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_c
-void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
-void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_idct8x8_64_add vpx_highbd_idct8x8_64_add_sse2
-void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c
-void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd);
#define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c
void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
@@ -1430,58 +1430,58 @@ void vpx_highbd_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint1
void vpx_highbd_v_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
#define vpx_highbd_v_predictor_8x8 vpx_highbd_v_predictor_8x8_sse2
-void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct16x16_10_add vpx_idct16x16_10_add_sse2
-void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct16x16_1_add vpx_idct16x16_1_add_sse2
-void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct16x16_256_add vpx_idct16x16_256_add_sse2
-void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct32x32_1024_add_ssse3(const tran_low_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_1024_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vpx_idct32x32_135_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct32x32_135_add)(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct32x32_1_add vpx_idct32x32_1_add_sse2
-void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct32x32_34_add_ssse3(const tran_low_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vpx_idct32x32_34_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_34_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct32x32_34_add)(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct4x4_16_add vpx_idct4x4_16_add_sse2
-void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct4x4_1_add vpx_idct4x4_1_add_sse2
-void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vpx_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct8x8_1_add vpx_idct8x8_1_add_sse2
-void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct8x8_64_add_ssse3(const tran_low_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vpx_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct8x8_64_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride);
int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width);
int16_t vpx_int_pro_col_sse2(const uint8_t *ref, const int width);
@@ -1491,11 +1491,11 @@ void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride,
void vpx_int_pro_row_sse2(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height);
#define vpx_int_pro_row vpx_int_pro_row_sse2
-void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_iwht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_iwht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_sse2
-void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c
void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
@@ -1549,12 +1549,12 @@ void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0,
#define vpx_lpf_vertical_8_dual vpx_lpf_vertical_8_dual_sse2
void vpx_mbpost_proc_across_ip_c(unsigned char *dst, int pitch, int rows, int cols,int flimit);
-void vpx_mbpost_proc_across_ip_xmm(unsigned char *dst, int pitch, int rows, int cols,int flimit);
-#define vpx_mbpost_proc_across_ip vpx_mbpost_proc_across_ip_xmm
+void vpx_mbpost_proc_across_ip_sse2(unsigned char *dst, int pitch, int rows, int cols,int flimit);
+#define vpx_mbpost_proc_across_ip vpx_mbpost_proc_across_ip_sse2
void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,int flimit);
-void vpx_mbpost_proc_down_xmm(unsigned char *dst, int pitch, int rows, int cols,int flimit);
-#define vpx_mbpost_proc_down vpx_mbpost_proc_down_xmm
+void vpx_mbpost_proc_down_sse2(unsigned char *dst, int pitch, int rows, int cols,int flimit);
+#define vpx_mbpost_proc_down vpx_mbpost_proc_down_sse2
void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
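The block above is libvpx's run-time CPU detection (RTCD) as emitted into the generated vpx_dsp_rtcd.h: when a function has several usable SIMD specializations (here C/SSE2/SSSE3), the header declares an RTCD_EXTERN function pointer that is filled in once at start-up, and when only one specialization applies a plain #define aliases the generic name to it. A minimal, self-contained sketch of that dispatch idea; the names and the CPUID probes below are illustrative stand-ins, not the library's real setup code:

    #include <stdint.h>
    #include <stdio.h>

    typedef int16_t tran_low_t; /* stand-in for the coefficient type in the prototypes above */

    /* stand-ins for the per-ISA variants declared in the header */
    static void idct_add_c(const tran_low_t *in, uint8_t *dst, int stride) {
      (void)in; (void)dst; (void)stride; puts("C");
    }
    static void idct_add_sse2(const tran_low_t *in, uint8_t *dst, int stride) {
      (void)in; (void)dst; (void)stride; puts("SSE2");
    }
    static void idct_add_ssse3(const tran_low_t *in, uint8_t *dst, int stride) {
      (void)in; (void)dst; (void)stride; puts("SSSE3");
    }

    /* the RTCD_EXTERN-style pointer that callers go through */
    static void (*idct_add)(const tran_low_t *, uint8_t *, int);

    static int cpu_has_sse2(void)  { return 1; } /* hypothetical CPUID probes */
    static int cpu_has_ssse3(void) { return 0; }

    static void setup_rtcd(void) {
      idct_add = idct_add_c;                        /* always-safe C fallback */
      if (cpu_has_sse2()) idct_add = idct_add_sse2;
      if (cpu_has_ssse3()) idct_add = idct_add_ssse3;
    }

    int main(void) {
      tran_low_t coeffs[32 * 32] = { 0 };
      uint8_t dest[32 * 32] = { 0 };
      setup_rtcd();
      idct_add(coeffs, dest, 32); /* dispatches to the best variant picked above */
      return 0;
    }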
diff --git a/chromium/third_party/libvpx/source/libvpx/build/make/Android.mk b/chromium/third_party/libvpx/source/libvpx/build/make/Android.mk
index 09bdc5d2f70..a88f90056e4 100644
--- a/chromium/third_party/libvpx/source/libvpx/build/make/Android.mk
+++ b/chromium/third_party/libvpx/source/libvpx/build/make/Android.mk
@@ -64,6 +64,9 @@ CONFIG_DIR := $(LOCAL_PATH)/
LIBVPX_PATH := $(LOCAL_PATH)/libvpx
ASM_CNV_PATH_LOCAL := $(TARGET_ARCH_ABI)/ads2gas
ASM_CNV_PATH := $(LOCAL_PATH)/$(ASM_CNV_PATH_LOCAL)
+ifneq ($(V),1)
+ qexec := @
+endif
# Use the makefiles generated by upstream configure to determine which files to
# build. Also set any architecture-specific flags.
@@ -103,8 +106,8 @@ LOCAL_ASMFLAGS := -I$(LIBVPX_PATH)
.PRECIOUS: %.asm.S
$(ASM_CNV_PATH)/libvpx/%.asm.S: $(LIBVPX_PATH)/%.asm
- @mkdir -p $(dir $@)
- @$(CONFIG_DIR)$(ASM_CONVERSION) <$< > $@
+ $(qexec)mkdir -p $(dir $@)
+ $(qexec)$(CONFIG_DIR)$(ASM_CONVERSION) <$< > $@
# For building *_rtcd.h, which have rules in libs.mk
TGT_ISA:=$(word 1, $(subst -, ,$(TOOLCHAIN)))
@@ -150,15 +153,27 @@ CODEC_SRCS_ASM_ADS2GAS = $(patsubst %.S, \
LOCAL_SRC_FILES += $(CODEC_SRCS_ASM_ADS2GAS)
ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
+ ASM_INCLUDES := vpx_dsp/arm/idct_neon.asm.S
CODEC_SRCS_ASM_NEON = $(foreach v, \
$(CODEC_SRCS_ASM_ARM_ALL),\
$(if $(findstring neon,$(v)),$(v),))
+ CODEC_SRCS_ASM_NEON := $(filter-out $(addprefix %, $(ASM_INCLUDES)), \
+ $(CODEC_SRCS_ASM_NEON))
CODEC_SRCS_ASM_NEON_ADS2GAS = $(patsubst %.S, \
$(ASM_CNV_PATH_LOCAL)/libvpx/%.S, \
$(CODEC_SRCS_ASM_NEON))
LOCAL_SRC_FILES += $(patsubst %.S, \
%.S.neon, \
$(CODEC_SRCS_ASM_NEON_ADS2GAS))
+
+ NEON_ASM_TARGETS = $(patsubst %.S, \
+ $(ASM_CNV_PATH)/libvpx/%.S, \
+ $(CODEC_SRCS_ASM_NEON))
+# add a dependency to the full path to the ads2gas output to ensure the
+# includes are converted first.
+ifneq ($(strip $(NEON_ASM_TARGETS)),)
+$(NEON_ASM_TARGETS): $(addprefix $(ASM_CNV_PATH)/libvpx/, $(ASM_INCLUDES))
+endif
endif
LOCAL_CFLAGS += \
@@ -187,7 +202,7 @@ $$(rtcd_dep_template_SRCS): vpx_scale_rtcd.h
$$(rtcd_dep_template_SRCS): vpx_dsp_rtcd.h
rtcd_dep_template_CONFIG_ASM_ABIS := x86 x86_64 armeabi-v7a
-ifneq ($(findstring $(TARGET_ARCH_ABI),$(rtcd_dep_template_CONFIG_ASM_ABIS)),)
+ifneq ($$(findstring $(TARGET_ARCH_ABI),$$(rtcd_dep_template_CONFIG_ASM_ABIS)),)
$$(rtcd_dep_template_SRCS): vpx_config.asm
endif
endef
@@ -197,16 +212,17 @@ $(eval $(call rtcd_dep_template))
.PHONY: clean
clean:
@echo "Clean: ads2gas files [$(TARGET_ARCH_ABI)]"
- @$(RM) $(CODEC_SRCS_ASM_ADS2GAS) $(CODEC_SRCS_ASM_NEON_ADS2GAS)
- @$(RM) -r $(ASM_CNV_PATH)
- @$(RM) $(CLEAN-OBJS)
+ $(qexec)$(RM) $(CODEC_SRCS_ASM_ADS2GAS) $(CODEC_SRCS_ASM_NEON_ADS2GAS)
+ $(qexec)$(RM) -r $(ASM_CNV_PATH)
+ $(qexec)$(RM) $(CLEAN-OBJS)
ifeq ($(ENABLE_SHARED),1)
+ LOCAL_CFLAGS += -fPIC
include $(BUILD_SHARED_LIBRARY)
else
include $(BUILD_STATIC_LIBRARY)
endif
ifeq ($(CONFIG_RUNTIME_CPU_DETECT),yes)
-$(call import-module,cpufeatures)
+$(call import-module,android/cpufeatures)
endif
diff --git a/chromium/third_party/libvpx/source/libvpx/examples/vp8_multi_resolution_encoder.c b/chromium/third_party/libvpx/source/libvpx/examples/vp8_multi_resolution_encoder.c
index 65308a0bd0b..0b9663c777b 100644
--- a/chromium/third_party/libvpx/source/libvpx/examples/vp8_multi_resolution_encoder.c
+++ b/chromium/third_party/libvpx/source/libvpx/examples/vp8_multi_resolution_encoder.c
@@ -240,9 +240,9 @@ static void set_temporal_layer_pattern(int num_temporal_layers,
cfg->ts_layer_id[1] = 2;
cfg->ts_layer_id[2] = 1;
cfg->ts_layer_id[3] = 2;
- // Use 40/20/40 bit allocation as example.
- cfg->ts_target_bitrate[0] = 0.4f * bitrate;
- cfg->ts_target_bitrate[1] = 0.6f * bitrate;
+ // Use 45/20/35 bit allocation as example.
+ cfg->ts_target_bitrate[0] = 0.45f * bitrate;
+ cfg->ts_target_bitrate[1] = 0.65f * bitrate;
cfg->ts_target_bitrate[2] = bitrate;
/* 0=L, 1=GF, 2=ARF */
@@ -460,7 +460,7 @@ int main(int argc, char **argv) {
// Set the number of threads per encode/spatial layer.
// (1, 1, 1) means no encoder threading.
- cfg[0].g_threads = 2;
+ cfg[0].g_threads = 1;
cfg[1].g_threads = 1;
cfg[2].g_threads = 1;
@@ -507,9 +507,11 @@ int main(int argc, char **argv) {
/* Set NOISE_SENSITIVITY to do TEMPORAL_DENOISING */
/* Enable denoising for the highest-resolution encoder. */
- if (vpx_codec_control(&codec[0], VP8E_SET_NOISE_SENSITIVITY, 4))
+ if (vpx_codec_control(&codec[0], VP8E_SET_NOISE_SENSITIVITY, 1))
die_codec(&codec[0], "Failed to set noise_sensitivity");
- for (i = 1; i < NUM_ENCODERS; i++) {
+ if (vpx_codec_control(&codec[1], VP8E_SET_NOISE_SENSITIVITY, 1))
+ die_codec(&codec[1], "Failed to set noise_sensitivity");
+ for (i = 2; i < NUM_ENCODERS; i++) {
if (vpx_codec_control(&codec[i], VP8E_SET_NOISE_SENSITIVITY, 0))
die_codec(&codec[i], "Failed to set noise_sensitivity");
}
diff --git a/chromium/third_party/libvpx/source/libvpx/examples/vp9_spatial_svc_encoder.c b/chromium/third_party/libvpx/source/libvpx/examples/vp9_spatial_svc_encoder.c
index fa2df7271b2..0e409387b3e 100644
--- a/chromium/third_party/libvpx/source/libvpx/examples/vp9_spatial_svc_encoder.c
+++ b/chromium/third_party/libvpx/source/libvpx/examples/vp9_spatial_svc_encoder.c
@@ -679,7 +679,7 @@ int main(int argc, const char **argv) {
}
#if OUTPUT_RC_STATS
// For now, just write temporal layer streams.
- // TODO(wonkap): do spatial by re-writing superframe.
+ // TODO(marpan): do spatial by re-writing superframe.
if (svc_ctx.output_rc_stat) {
for (tl = 0; tl < enc_cfg.ts_number_layers; ++tl) {
char file_name[PATH_MAX];
@@ -770,7 +770,7 @@ int main(int argc, const char **argv) {
cx_pkt->data.frame.sz,
cx_pkt->data.frame.pts);
#if OUTPUT_RC_STATS
- // TODO(marpan/wonkap): Put this (to line728) in separate function.
+ // TODO(marpan): Put this (to line728) in separate function.
if (svc_ctx.output_rc_stat) {
vpx_codec_control(&codec, VP9E_GET_SVC_LAYER_ID, &layer_id);
parse_superframe_index(cx_pkt->data.frame.buf,
diff --git a/chromium/third_party/libvpx/source/libvpx/examples/vpx_temporal_svc_encoder.c b/chromium/third_party/libvpx/source/libvpx/examples/vpx_temporal_svc_encoder.c
index 752c1baead1..b9069808350 100644
--- a/chromium/third_party/libvpx/source/libvpx/examples/vpx_temporal_svc_encoder.c
+++ b/chromium/third_party/libvpx/source/libvpx/examples/vpx_temporal_svc_encoder.c
@@ -702,11 +702,14 @@ int main(int argc, char **argv) {
vpx_codec_control(&codec, VP8E_SET_CPUUSED, -speed);
vpx_codec_control(&codec, VP8E_SET_NOISE_SENSITIVITY, kDenoiserOff);
vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1);
+ vpx_codec_control(&codec, VP8E_SET_GF_CBR_BOOST_PCT, 0);
} else if (strncmp(encoder->name, "vp9", 3) == 0) {
vpx_svc_extra_cfg_t svc_params;
memset(&svc_params, 0, sizeof(svc_params));
vpx_codec_control(&codec, VP8E_SET_CPUUSED, speed);
vpx_codec_control(&codec, VP9E_SET_AQ_MODE, 3);
+ vpx_codec_control(&codec, VP9E_SET_GF_CBR_BOOST_PCT, 0);
+ vpx_codec_control(&codec, VP9E_SET_FRAME_PARALLEL_DECODING, 0);
vpx_codec_control(&codec, VP9E_SET_FRAME_PERIODIC_BOOST, 0);
vpx_codec_control(&codec, VP9E_SET_NOISE_SENSITIVITY, kDenoiserOff);
vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1);
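The two added controls turn off golden-frame CBR boost and frame-parallel decoding for the VP9 path of this example, keeping CBR rate control strictly per temporal layer. A small sketch of grouping those one-time controls behind a helper; it assumes an already-initialized vpx_codec_ctx_t and the die_codec() helper the examples declare in tools_common.h, and the helper name itself is illustrative:

    #include "vpx/vpx_encoder.h"
    #include "vpx/vp8cx.h"

    void die_codec(vpx_codec_ctx_t *ctx, const char *s); /* from the examples' tools_common.h */

    /* Apply the one-time VP9 CBR/SVC settings shown above, aborting on error. */
    static void set_vp9_cbr_controls(vpx_codec_ctx_t *codec, int speed) {
      if (vpx_codec_control(codec, VP8E_SET_CPUUSED, speed) ||
          vpx_codec_control(codec, VP9E_SET_AQ_MODE, 3) ||
          vpx_codec_control(codec, VP9E_SET_GF_CBR_BOOST_PCT, 0) ||
          vpx_codec_control(codec, VP9E_SET_FRAME_PARALLEL_DECODING, 0) ||
          vpx_codec_control(codec, VP9E_SET_FRAME_PERIODIC_BOOST, 0) ||
          vpx_codec_control(codec, VP8E_SET_STATIC_THRESHOLD, 1))
        die_codec(codec, "Failed to set an encoder control");
    }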
diff --git a/chromium/third_party/libvpx/source/libvpx/libs.mk b/chromium/third_party/libvpx/source/libvpx/libs.mk
index f4f48cc1621..e0a2cc097de 100644
--- a/chromium/third_party/libvpx/source/libvpx/libs.mk
+++ b/chromium/third_party/libvpx/source/libvpx/libs.mk
@@ -391,7 +391,7 @@ LIBVPX_TEST_SRCS=$(addprefix test/,$(call enabled,LIBVPX_TEST_SRCS))
LIBVPX_TEST_BIN=./test_libvpx$(EXE_SFX)
LIBVPX_TEST_DATA=$(addprefix $(LIBVPX_TEST_DATA_PATH)/,\
$(call enabled,LIBVPX_TEST_DATA))
-libvpx_test_data_url=http://downloads.webmproject.org/test_data/libvpx/$(1)
+libvpx_test_data_url=https://storage.googleapis.com/downloads.webmproject.org/test_data/libvpx/$(1)
TEST_INTRA_PRED_SPEED_BIN=./test_intra_pred_speed$(EXE_SFX)
TEST_INTRA_PRED_SPEED_SRCS=$(addprefix test/,$(call enabled,TEST_INTRA_PRED_SPEED_SRCS))
@@ -405,7 +405,7 @@ CLEAN-OBJS += libvpx_test_srcs.txt
$(LIBVPX_TEST_DATA): $(SRC_PATH_BARE)/test/test-data.sha1
@echo " [DOWNLOAD] $@"
$(qexec)trap 'rm -f $@' INT TERM &&\
- curl -L -o $@ $(call libvpx_test_data_url,$(@F))
+ curl --retry 1 -L -o $@ $(call libvpx_test_data_url,$(@F))
testdata:: $(LIBVPX_TEST_DATA)
$(qexec)[ -x "$$(which sha1sum)" ] && sha1sum=sha1sum;\
diff --git a/chromium/third_party/libvpx/source/libvpx/tools_common.h b/chromium/third_party/libvpx/source/libvpx/tools_common.h
index 73ba1bc03ba..c4a48b24de0 100644
--- a/chromium/third_party/libvpx/source/libvpx/tools_common.h
+++ b/chromium/third_party/libvpx/source/libvpx/tools_common.h
@@ -26,11 +26,21 @@
/* MSVS uses _f{seek,tell}i64. */
#define fseeko _fseeki64
#define ftello _ftelli64
+typedef int64_t FileOffset;
#elif defined(_WIN32)
/* MinGW uses f{seek,tell}o64 for large files. */
#define fseeko fseeko64
#define ftello ftello64
-#endif /* _WIN32 */
+typedef off64_t FileOffset;
+#elif CONFIG_OS_SUPPORT
+typedef off_t FileOffset;
+/* Use 32-bit file operations in WebM file format when building ARM
+ * executables (.axf) with RVCT. */
+#else
+#define fseeko fseek
+#define ftello ftell
+typedef long FileOffset; /* NOLINT */
+#endif /* CONFIG_OS_SUPPORT */
#if CONFIG_OS_SUPPORT
#if defined(_MSC_VER)
@@ -42,13 +52,6 @@
#endif /* _MSC_VER */
#endif /* CONFIG_OS_SUPPORT */
-/* Use 32-bit file operations in WebM file format when building ARM
- * executables (.axf) with RVCT. */
-#if !CONFIG_OS_SUPPORT
-#define fseeko fseek
-#define ftello ftell
-#endif /* CONFIG_OS_SUPPORT */
-
#define LITERALU64(hi, lo) ((((uint64_t)hi) << 32) | lo)
#ifndef PATH_MAX
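The reshuffled block above gives every platform a FileOffset type that can hold large-file positions (int64_t on MSVC, off64_t on MinGW, off_t elsewhere) and only falls back to plain fseek/ftell when CONFIG_OS_SUPPORT is off. A minimal sketch of the intended usage on the POSIX branch; the function and file names below are illustrative:

    #define _FILE_OFFSET_BITS 64 /* make off_t 64-bit on 32-bit POSIX targets */
    #include <stdio.h>
    #include <sys/types.h>

    typedef off_t FileOffset; /* the CONFIG_OS_SUPPORT branch above */

    /* Return the size of an open file without disturbing the caller's position. */
    static FileOffset file_size(FILE *f) {
      const FileOffset cur = ftello(f);
      FileOffset end;
      fseeko(f, 0, SEEK_END);
      end = ftello(f);
      fseeko(f, cur, SEEK_SET);
      return end;
    }

    int main(void) {
      FILE *f = fopen("input.ivf", "rb"); /* hypothetical input */
      if (f) {
        printf("%lld bytes\n", (long long)file_size(f));
        fclose(f);
      }
      return 0;
    }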
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/mips/dspr2/filter_dspr2.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/mips/dspr2/filter_dspr2.c
index 7612024b7d0..2de343419ac 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/mips/dspr2/filter_dspr2.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/mips/dspr2/filter_dspr2.c
@@ -1469,6 +1469,7 @@ void vp8_filter_block2d_second_pass_8(unsigned char *RESTRICT src_ptr,
unsigned char src_ptr_r2;
unsigned char src_ptr_r3;
unsigned char *cm = ff_cropTbl + CROP_WIDTH;
+ (void)output_width;
vector4a = 64;
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/mips/dspr2/vp8_loopfilter_filters_dspr2.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/mips/dspr2/vp8_loopfilter_filters_dspr2.c
index b79af1cc88f..d2c34425156 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/mips/dspr2/vp8_loopfilter_filters_dspr2.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/mips/dspr2/vp8_loopfilter_filters_dspr2.c
@@ -306,6 +306,7 @@ void vp8_loop_filter_horizontal_edge_mips(unsigned char *s, int p,
uint32_t hev;
uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
+ (void)count;
mask = 0;
hev = 0;
@@ -498,6 +499,7 @@ void vp8_loop_filter_uvhorizontal_edge_mips(unsigned char *s, int p,
uint32_t hev;
uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
+ (void)count;
mask = 0;
hev = 0;
@@ -918,6 +920,7 @@ void vp8_loop_filter_uvvertical_edge_mips(unsigned char *s, int p,
uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
unsigned char *s1, *s2, *s3, *s4;
uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
+ (void)count;
/* loop filter designed to work using chars so that we can make maximum use
* of 8 bit simd instructions.
@@ -1612,6 +1615,7 @@ void vp8_mbloop_filter_uvhorizontal_edge_mips(unsigned char *s, int p,
uint32_t mask, hev;
uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
+ (void)count;
mask = 0;
hev = 0;
@@ -1915,6 +1919,7 @@ void vp8_mbloop_filter_uvvertical_edge_mips(unsigned char *s, int p,
uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
unsigned char *s1, *s2, *s3, *s4;
uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
+ (void)count;
mask = 0;
hev = 0;
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/ppflags.h b/chromium/third_party/libvpx/source/libvpx/vp8/common/ppflags.h
index 713f5dffe09..96e3af6c9c1 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/ppflags.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/ppflags.h
@@ -19,14 +19,7 @@ enum {
VP8D_DEBLOCK = 1 << 0,
VP8D_DEMACROBLOCK = 1 << 1,
VP8D_ADDNOISE = 1 << 2,
- VP8D_DEBUG_TXT_FRAME_INFO = 1 << 3,
- VP8D_DEBUG_TXT_MBLK_MODES = 1 << 4,
- VP8D_DEBUG_TXT_DC_DIFF = 1 << 5,
- VP8D_DEBUG_TXT_RATE_INFO = 1 << 6,
- VP8D_DEBUG_DRAW_MV = 1 << 7,
- VP8D_DEBUG_CLR_BLK_MODES = 1 << 8,
- VP8D_DEBUG_CLR_FRM_REF_BLKS = 1 << 9,
- VP8D_MFQE = 1 << 10
+ VP8D_MFQE = 1 << 3
};
typedef struct {
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/rtcd_defs.pl b/chromium/third_party/libvpx/source/libvpx/vp8/common/rtcd_defs.pl
index c0e95b15a0f..bc5e0579999 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/rtcd_defs.pl
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/rtcd_defs.pl
@@ -210,8 +210,9 @@ $vp8_full_search_sad_sse3=vp8_full_search_sadx3;
$vp8_full_search_sad_sse4_1=vp8_full_search_sadx8;
add_proto qw/int vp8_refining_search_sad/, "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv";
-specialize qw/vp8_refining_search_sad sse3/;
-$vp8_refining_search_sad_sse3=vp8_refining_search_sadx4;
+specialize qw/vp8_refining_search_sad sse2 msa/;
+$vp8_refining_search_sad_sse2=vp8_refining_search_sadx4;
+$vp8_refining_search_sad_msa=vp8_refining_search_sadx4;
add_proto qw/int vp8_diamond_search_sad/, "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv";
specialize qw/vp8_diamond_search_sad sse2 msa/;
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/threading.h b/chromium/third_party/libvpx/source/libvpx/vp8/common/threading.h
index f27b209c40e..ece64f3fb43 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/threading.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/threading.h
@@ -191,8 +191,47 @@ static inline int sem_destroy(sem_t *sem) {
#define x86_pause_hint()
#endif
+#if defined(__has_feature)
+#if __has_feature(thread_sanitizer)
+#define USE_MUTEX_LOCK 1
+#endif
+#endif
+
#include "vpx_util/vpx_thread.h"
+static INLINE int protected_read(pthread_mutex_t *const mutex, const int *p) {
+ (void)mutex;
+#if defined(USE_MUTEX_LOCK)
+ int ret;
+ pthread_mutex_lock(mutex);
+ ret = *p;
+ pthread_mutex_unlock(mutex);
+ return ret;
+#endif
+ return *p;
+}
+
+static INLINE void sync_read(pthread_mutex_t *const mutex, int mb_col,
+ const int *last_row_current_mb_col,
+ const int nsync) {
+ while (mb_col > (protected_read(mutex, last_row_current_mb_col) - nsync)) {
+ x86_pause_hint();
+ thread_sleep(0);
+ }
+}
+
+static INLINE void protected_write(pthread_mutex_t *mutex, int *p, int v) {
+ (void)mutex;
+#if defined(USE_MUTEX_LOCK)
+ pthread_mutex_lock(mutex);
+ *p = v;
+ pthread_mutex_unlock(mutex);
+ return;
+#endif
+ *p = v;
+}
+
+#undef USE_MUTEX_LOCK
#endif /* CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD */
#ifdef __cplusplus
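These helpers replace the volatile-qualified progress counters the row threads used before (see the decoder and encoder hunks below): a row publishes how far it has decoded with protected_write(), and the row beneath it spins in sync_read()/protected_read() until the row above is at least nsync columns ahead; a real mutex is only taken when building under ThreadSanitizer. A self-contained toy version of that handshake, with MB_COLS, NSYNC and sched_yield() as illustrative stand-ins for the codec's values and pause/sleep hints:

    #include <pthread.h>
    #include <sched.h>
    #include <stdio.h>

    enum { MB_COLS = 64, NSYNC = 4 };

    static pthread_mutex_t row_mutex = PTHREAD_MUTEX_INITIALIZER;
    static int row0_col = -1; /* progress published by the first row */

    /* same shape as the helpers above, with the mutex always taken */
    static int protected_read(pthread_mutex_t *m, const int *p) {
      int v;
      pthread_mutex_lock(m); v = *p; pthread_mutex_unlock(m);
      return v;
    }
    static void protected_write(pthread_mutex_t *m, int *p, int v) {
      pthread_mutex_lock(m); *p = v; pthread_mutex_unlock(m);
    }

    static void *row0(void *arg) {
      int col;
      (void)arg;
      for (col = 0; col < MB_COLS; ++col) {
        /* ... decode macroblock (0, col) ... */
        if (((col - 1) % NSYNC) == 0)
          protected_write(&row_mutex, &row0_col, col - 1);
      }
      protected_write(&row_mutex, &row0_col, MB_COLS + NSYNC); /* row finished */
      return NULL;
    }

    static void *row1(void *arg) {
      int col;
      (void)arg;
      for (col = 0; col < MB_COLS; ++col) {
        if (!(col & (NSYNC - 1))) /* same cadence as sync_read() above */
          while (col > protected_read(&row_mutex, &row0_col) - NSYNC)
            sched_yield(); /* stand-in for x86_pause_hint()/thread_sleep(0) */
        /* ... decode macroblock (1, col); the row above is far enough ahead ... */
      }
      return NULL;
    }

    int main(void) {
      pthread_t t0, t1;
      pthread_create(&t0, NULL, row0, NULL);
      pthread_create(&t1, NULL, row1, NULL);
      pthread_join(t0, NULL);
      pthread_join(t1, NULL);
      puts("both rows finished in dependency order");
      return 0;
    }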
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/decoder/onyxd_int.h b/chromium/third_party/libvpx/source/libvpx/vp8/decoder/onyxd_int.h
index e50fafd4f94..88b1ff16bca 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/decoder/onyxd_int.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/decoder/onyxd_int.h
@@ -67,7 +67,8 @@ typedef struct VP8D_COMP {
#if CONFIG_MULTITHREAD
/* variable for threading */
- volatile int b_multithreaded_rd;
+
+ int b_multithreaded_rd;
int max_threads;
int current_mb_col_main;
unsigned int decoding_thread_count;
@@ -76,6 +77,8 @@ typedef struct VP8D_COMP {
int mt_baseline_filter_level[MAX_MB_SEGMENTS];
int sync_range;
int *mt_current_mb_col; /* Each row remembers its already decoded column. */
+ pthread_mutex_t *pmutex;
+ pthread_mutex_t mt_mutex; /* mutex for b_multithreaded_rd */
unsigned char **mt_yabove_row; /* mb_rows x width */
unsigned char **mt_uabove_row;
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/decoder/threading.c b/chromium/third_party/libvpx/source/libvpx/vp8/decoder/threading.c
index 44ca16bfdd4..9f77519882c 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/decoder/threading.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/decoder/threading.c
@@ -50,9 +50,6 @@ static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd,
mbd->subpixel_predict8x8 = xd->subpixel_predict8x8;
mbd->subpixel_predict16x16 = xd->subpixel_predict16x16;
- mbd->mode_info_context = pc->mi + pc->mode_info_stride * (i + 1);
- mbd->mode_info_stride = pc->mode_info_stride;
-
mbd->frame_type = pc->frame_type;
mbd->pre = xd->pre;
mbd->dst = xd->dst;
@@ -251,8 +248,8 @@ static void mt_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd,
int start_mb_row) {
- volatile const int *last_row_current_mb_col;
- volatile int *current_mb_col;
+ const int *last_row_current_mb_col;
+ int *current_mb_col;
int mb_row;
VP8_COMMON *pc = &pbi->common;
const int nsync = pbi->sync_range;
@@ -289,6 +286,9 @@ static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd,
xd->up_available = (start_mb_row != 0);
+ xd->mode_info_context = pc->mi + pc->mode_info_stride * start_mb_row;
+ xd->mode_info_stride = pc->mode_info_stride;
+
for (mb_row = start_mb_row; mb_row < pc->mb_rows;
mb_row += (pbi->decoding_thread_count + 1)) {
int recon_yoffset, recon_uvoffset;
@@ -318,7 +318,7 @@ static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd,
xd->left_available = 0;
- xd->mb_to_top_edge = -((mb_row * 16)) << 3;
+ xd->mb_to_top_edge = -((mb_row * 16) << 3);
xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3;
if (pbi->common.filter_level) {
@@ -355,14 +355,15 @@ static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd,
xd->dst.uv_stride);
}
- for (mb_col = 0; mb_col < pc->mb_cols; mb_col++) {
- *current_mb_col = mb_col - 1;
+ for (mb_col = 0; mb_col < pc->mb_cols; ++mb_col) {
+ if (((mb_col - 1) % nsync) == 0) {
+ pthread_mutex_t *mutex = &pbi->pmutex[mb_row];
+ protected_write(mutex, current_mb_col, mb_col - 1);
+ }
- if ((mb_col & (nsync - 1)) == 0) {
- while (mb_col > (*last_row_current_mb_col - nsync)) {
- x86_pause_hint();
- thread_sleep(0);
- }
+ if (mb_row && !(mb_col & (nsync - 1))) {
+ pthread_mutex_t *mutex = &pbi->pmutex[mb_row - 1];
+ sync_read(mutex, mb_col, last_row_current_mb_col, nsync);
}
/* Distance of MB to the various image edges.
@@ -548,7 +549,7 @@ static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd,
}
/* last MB of row is ready just after extension is done */
- *current_mb_col = mb_col + nsync;
+ protected_write(&pbi->pmutex[mb_row], current_mb_col, mb_col + nsync);
++xd->mode_info_context; /* skip prediction column */
xd->up_available = 1;
@@ -568,10 +569,10 @@ static THREAD_FUNCTION thread_decoding_proc(void *p_data) {
ENTROPY_CONTEXT_PLANES mb_row_left_context;
while (1) {
- if (pbi->b_multithreaded_rd == 0) break;
+ if (protected_read(&pbi->mt_mutex, &pbi->b_multithreaded_rd) == 0) break;
if (sem_wait(&pbi->h_event_start_decoding[ithread]) == 0) {
- if (pbi->b_multithreaded_rd == 0) {
+ if (protected_read(&pbi->mt_mutex, &pbi->b_multithreaded_rd) == 0) {
break;
} else {
MACROBLOCKD *xd = &mbrd->mbd;
@@ -591,6 +592,7 @@ void vp8_decoder_create_threads(VP8D_COMP *pbi) {
pbi->b_multithreaded_rd = 0;
pbi->allocated_decoding_thread_count = 0;
+ pthread_mutex_init(&pbi->mt_mutex, NULL);
/* limit decoding threads to the max number of token partitions */
core_count = (pbi->max_threads > 8) ? 8 : pbi->max_threads;
@@ -647,6 +649,16 @@ void vp8_decoder_create_threads(VP8D_COMP *pbi) {
void vp8mt_de_alloc_temp_buffers(VP8D_COMP *pbi, int mb_rows) {
int i;
+ /* De-allocate mutex */
+ if (pbi->pmutex != NULL) {
+ for (i = 0; i < mb_rows; ++i) {
+ pthread_mutex_destroy(&pbi->pmutex[i]);
+ }
+
+ vpx_free(pbi->pmutex);
+ pbi->pmutex = NULL;
+ }
+
vpx_free(pbi->mt_current_mb_col);
pbi->mt_current_mb_col = NULL;
@@ -712,7 +724,7 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows) {
int i;
int uv_width;
- if (pbi->b_multithreaded_rd) {
+ if (protected_read(&pbi->mt_mutex, &pbi->b_multithreaded_rd)) {
vp8mt_de_alloc_temp_buffers(pbi, prev_mb_rows);
/* our internal buffers are always multiples of 16 */
@@ -730,6 +742,15 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows) {
uv_width = width >> 1;
+ /* Allocate mutex */
+ CHECK_MEM_ERROR(pbi->pmutex,
+ vpx_malloc(sizeof(*pbi->pmutex) * pc->mb_rows));
+ if (pbi->pmutex) {
+ for (i = 0; i < pc->mb_rows; ++i) {
+ pthread_mutex_init(&pbi->pmutex[i], NULL);
+ }
+ }
+
/* Allocate an int for each mb row. */
CALLOC_ARRAY(pbi->mt_current_mb_col, pc->mb_rows);
@@ -772,9 +793,9 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows) {
void vp8_decoder_remove_threads(VP8D_COMP *pbi) {
/* shutdown MB Decoding thread; */
- if (pbi->b_multithreaded_rd) {
+ if (protected_read(&pbi->mt_mutex, &pbi->b_multithreaded_rd)) {
int i;
- pbi->b_multithreaded_rd = 0;
+ protected_write(&pbi->mt_mutex, &pbi->b_multithreaded_rd, 0);
/* allow all threads to exit */
for (i = 0; i < pbi->allocated_decoding_thread_count; ++i) {
@@ -804,6 +825,7 @@ void vp8_decoder_remove_threads(VP8D_COMP *pbi) {
vp8mt_de_alloc_temp_buffers(pbi, pbi->common.mb_rows);
}
+ pthread_mutex_destroy(&pbi->mt_mutex);
}
void vp8mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd) {
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/encodeframe.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/encodeframe.c
index e41d513c1b7..c7ad3bfe2c9 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/encodeframe.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/encodeframe.c
@@ -345,8 +345,8 @@ static void encode_mb_row(VP8_COMP *cpi, VP8_COMMON *cm, int mb_row,
#if CONFIG_MULTITHREAD
const int nsync = cpi->mt_sync_range;
const int rightmost_col = cm->mb_cols + nsync;
- volatile const int *last_row_current_mb_col;
- volatile int *current_mb_col = &cpi->mt_current_mb_col[mb_row];
+ const int *last_row_current_mb_col;
+ int *current_mb_col = &cpi->mt_current_mb_col[mb_row];
if ((cpi->b_multi_threaded != 0) && (mb_row != 0)) {
last_row_current_mb_col = &cpi->mt_current_mb_col[mb_row - 1];
@@ -419,13 +419,14 @@ static void encode_mb_row(VP8_COMP *cpi, VP8_COMMON *cm, int mb_row,
#if CONFIG_MULTITHREAD
if (cpi->b_multi_threaded != 0) {
- *current_mb_col = mb_col - 1; /* set previous MB done */
+ if (((mb_col - 1) % nsync) == 0) {
+ pthread_mutex_t *mutex = &cpi->pmutex[mb_row];
+ protected_write(mutex, current_mb_col, mb_col - 1);
+ }
- if ((mb_col & (nsync - 1)) == 0) {
- while (mb_col > (*last_row_current_mb_col - nsync)) {
- x86_pause_hint();
- thread_sleep(0);
- }
+ if (mb_row && !(mb_col & (nsync - 1))) {
+ pthread_mutex_t *mutex = &cpi->pmutex[mb_row - 1];
+ sync_read(mutex, mb_col, last_row_current_mb_col, nsync);
}
}
#endif
@@ -565,7 +566,9 @@ static void encode_mb_row(VP8_COMP *cpi, VP8_COMMON *cm, int mb_row,
xd->dst.u_buffer + 8, xd->dst.v_buffer + 8);
#if CONFIG_MULTITHREAD
- if (cpi->b_multi_threaded != 0) *current_mb_col = rightmost_col;
+ if (cpi->b_multi_threaded != 0) {
+ protected_write(&cpi->pmutex[mb_row], current_mb_col, rightmost_col);
+ }
#endif
/* this is to account for the border */
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ethreading.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ethreading.c
index 708002b1e67..df34997accd 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ethreading.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ethreading.c
@@ -25,11 +25,11 @@ static THREAD_FUNCTION thread_loopfilter(void *p_data) {
VP8_COMMON *cm = &cpi->common;
while (1) {
- if (cpi->b_multi_threaded == 0) break;
+ if (protected_read(&cpi->mt_mutex, &cpi->b_multi_threaded) == 0) break;
if (sem_wait(&cpi->h_event_start_lpf) == 0) {
/* we're shutting down */
- if (cpi->b_multi_threaded == 0) break;
+ if (protected_read(&cpi->mt_mutex, &cpi->b_multi_threaded) == 0) break;
vp8_loopfilter_frame(cpi, cm);
@@ -47,7 +47,7 @@ static THREAD_FUNCTION thread_encoding_proc(void *p_data) {
ENTROPY_CONTEXT_PLANES mb_row_left_context;
while (1) {
- if (cpi->b_multi_threaded == 0) break;
+ if (protected_read(&cpi->mt_mutex, &cpi->b_multi_threaded) == 0) break;
if (sem_wait(&cpi->h_event_start_encoding[ithread]) == 0) {
const int nsync = cpi->mt_sync_range;
@@ -65,7 +65,10 @@ static THREAD_FUNCTION thread_encoding_proc(void *p_data) {
int *totalrate = &mbri->totalrate;
/* we're shutting down */
- if (cpi->b_multi_threaded == 0) break;
+ if (protected_read(&cpi->mt_mutex, &cpi->b_multi_threaded) == 0) break;
+
+ xd->mode_info_context = cm->mi + cm->mode_info_stride * (ithread + 1);
+ xd->mode_info_stride = cm->mode_info_stride;
for (mb_row = ithread + 1; mb_row < cm->mb_rows;
mb_row += (cpi->encoding_thread_count + 1)) {
@@ -76,8 +79,8 @@ static THREAD_FUNCTION thread_encoding_proc(void *p_data) {
int recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride;
int recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
int map_index = (mb_row * cm->mb_cols);
- volatile const int *last_row_current_mb_col;
- volatile int *current_mb_col = &cpi->mt_current_mb_col[mb_row];
+ const int *last_row_current_mb_col;
+ int *current_mb_col = &cpi->mt_current_mb_col[mb_row];
#if (CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
vp8_writer *w = &cpi->bc[1 + (mb_row % num_part)];
@@ -103,13 +106,14 @@ static THREAD_FUNCTION thread_encoding_proc(void *p_data) {
/* for each macroblock col in image */
for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) {
- *current_mb_col = mb_col - 1;
+ if (((mb_col - 1) % nsync) == 0) {
+ pthread_mutex_t *mutex = &cpi->pmutex[mb_row];
+ protected_write(mutex, current_mb_col, mb_col - 1);
+ }
- if ((mb_col & (nsync - 1)) == 0) {
- while (mb_col > (*last_row_current_mb_col - nsync)) {
- x86_pause_hint();
- thread_sleep(0);
- }
+ if (mb_row && !(mb_col & (nsync - 1))) {
+ pthread_mutex_t *mutex = &cpi->pmutex[mb_row - 1];
+ sync_read(mutex, mb_col, last_row_current_mb_col, nsync);
}
#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
@@ -281,7 +285,7 @@ static THREAD_FUNCTION thread_encoding_proc(void *p_data) {
vp8_extend_mb_row(&cm->yv12_fb[dst_fb_idx], xd->dst.y_buffer + 16,
xd->dst.u_buffer + 8, xd->dst.v_buffer + 8);
- *current_mb_col = mb_col + nsync;
+ protected_write(&cpi->pmutex[mb_row], current_mb_col, mb_col + nsync);
/* this is to account for the border */
xd->mode_info_context++;
@@ -450,9 +454,6 @@ void vp8cx_init_mbrthread_data(VP8_COMP *cpi, MACROBLOCK *x,
mb->partition_info = x->pi + x->e_mbd.mode_info_stride * (i + 1);
- mbd->mode_info_context = cm->mi + x->e_mbd.mode_info_stride * (i + 1);
- mbd->mode_info_stride = cm->mode_info_stride;
-
mbd->frame_type = cm->frame_type;
mb->src = *cpi->Source;
@@ -492,6 +493,8 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) {
cpi->encoding_thread_count = 0;
cpi->b_lpf_running = 0;
+ pthread_mutex_init(&cpi->mt_mutex, NULL);
+
if (cm->processor_core_count > 1 && cpi->oxcf.multi_threaded > 1) {
int ithread;
int th_count = cpi->oxcf.multi_threaded - 1;
@@ -551,7 +554,7 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) {
if (rc) {
/* shutdown other threads */
- cpi->b_multi_threaded = 0;
+ protected_write(&cpi->mt_mutex, &cpi->b_multi_threaded, 0);
for (--ithread; ithread >= 0; ithread--) {
pthread_join(cpi->h_encoding_thread[ithread], 0);
sem_destroy(&cpi->h_event_start_encoding[ithread]);
@@ -565,6 +568,8 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) {
vpx_free(cpi->mb_row_ei);
vpx_free(cpi->en_thread_data);
+ pthread_mutex_destroy(&cpi->mt_mutex);
+
return -1;
}
@@ -579,7 +584,7 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) {
if (rc) {
/* shutdown other threads */
- cpi->b_multi_threaded = 0;
+ protected_write(&cpi->mt_mutex, &cpi->b_multi_threaded, 0);
for (--ithread; ithread >= 0; ithread--) {
sem_post(&cpi->h_event_start_encoding[ithread]);
sem_post(&cpi->h_event_end_encoding[ithread]);
@@ -597,6 +602,8 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) {
vpx_free(cpi->mb_row_ei);
vpx_free(cpi->en_thread_data);
+ pthread_mutex_destroy(&cpi->mt_mutex);
+
return -2;
}
}
@@ -605,9 +612,9 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) {
}
void vp8cx_remove_encoder_threads(VP8_COMP *cpi) {
- if (cpi->b_multi_threaded) {
+ if (protected_read(&cpi->mt_mutex, &cpi->b_multi_threaded)) {
/* shutdown other threads */
- cpi->b_multi_threaded = 0;
+ protected_write(&cpi->mt_mutex, &cpi->b_multi_threaded, 0);
{
int i;
@@ -635,5 +642,6 @@ void vp8cx_remove_encoder_threads(VP8_COMP *cpi) {
vpx_free(cpi->mb_row_ei);
vpx_free(cpi->en_thread_data);
}
+ pthread_mutex_destroy(&cpi->mt_mutex);
}
#endif
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_if.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_if.c
index c5389594553..9717feb136b 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_if.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_if.c
@@ -446,6 +446,18 @@ static void dealloc_compressor_data(VP8_COMP *cpi) {
cpi->mb.pip = 0;
#if CONFIG_MULTITHREAD
+ /* De-allocate mutex */
+ if (cpi->pmutex != NULL) {
+ VP8_COMMON *const pc = &cpi->common;
+ int i;
+
+ for (i = 0; i < pc->mb_rows; ++i) {
+ pthread_mutex_destroy(&cpi->pmutex[i]);
+ }
+ vpx_free(cpi->pmutex);
+ cpi->pmutex = NULL;
+ }
+
vpx_free(cpi->mt_current_mb_col);
cpi->mt_current_mb_col = NULL;
#endif
@@ -1075,6 +1087,9 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi) {
int width = cm->Width;
int height = cm->Height;
+#if CONFIG_MULTITHREAD
+ int prev_mb_rows = cm->mb_rows;
+#endif
if (vp8_alloc_frame_buffers(cm, width, height)) {
vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
@@ -1164,6 +1179,25 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi) {
}
if (cpi->oxcf.multi_threaded > 1) {
+ int i;
+
+ /* De-allocate and re-allocate mutex */
+ if (cpi->pmutex != NULL) {
+ for (i = 0; i < prev_mb_rows; ++i) {
+ pthread_mutex_destroy(&cpi->pmutex[i]);
+ }
+ vpx_free(cpi->pmutex);
+ cpi->pmutex = NULL;
+ }
+
+ CHECK_MEM_ERROR(cpi->pmutex,
+ vpx_malloc(sizeof(*cpi->pmutex) * cm->mb_rows));
+ if (cpi->pmutex) {
+ for (i = 0; i < cm->mb_rows; ++i) {
+ pthread_mutex_init(&cpi->pmutex[i], NULL);
+ }
+ }
+
vpx_free(cpi->mt_current_mb_col);
CHECK_MEM_ERROR(cpi->mt_current_mb_col,
vpx_malloc(sizeof(*cpi->mt_current_mb_col) * cm->mb_rows));
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_int.h b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_int.h
index bfcc6457c19..fe775064a45 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_int.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_int.h
@@ -511,6 +511,8 @@ typedef struct VP8_COMP {
#if CONFIG_MULTITHREAD
/* multithread data */
+ pthread_mutex_t *pmutex;
+ pthread_mutex_t mt_mutex; /* mutex for b_multi_threaded */
int *mt_current_mb_col;
int mt_sync_range;
int b_multi_threaded;
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c
index cbd61de90ab..dd1ea03b6b9 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c
@@ -77,10 +77,10 @@ static INLINE void IDCT4x4_1D(int16x4_t *d0s16, int16x4_t *d1s16,
q15s32 = vmlsl_s16(q15s32, d19s16, *d0s16);
q10s32 = vmlal_s16(q10s32, d19s16, *d2s16);
- d26s16 = vqrshrn_n_s32(q13s32, 14);
- d27s16 = vqrshrn_n_s32(q14s32, 14);
- d29s16 = vqrshrn_n_s32(q15s32, 14);
- d28s16 = vqrshrn_n_s32(q10s32, 14);
+ d26s16 = vrshrn_n_s32(q13s32, 14);
+ d27s16 = vrshrn_n_s32(q14s32, 14);
+ d29s16 = vrshrn_n_s32(q15s32, 14);
+ d28s16 = vrshrn_n_s32(q10s32, 14);
q13s16 = vcombine_s16(d26s16, d27s16);
q14s16 = vcombine_s16(d28s16, d29s16);
@@ -125,17 +125,17 @@ static INLINE void IADST4x4_1D(int16x4_t *d3s16, int16x4_t *d4s16,
q14s32 = vaddq_s32(q11s32, q12s32);
q10s32 = vsubq_s32(q10s32, q12s32);
- d16s16 = vqrshrn_n_s32(q13s32, 14);
- d17s16 = vqrshrn_n_s32(q14s32, 14);
- d18s16 = vqrshrn_n_s32(q15s32, 14);
- d19s16 = vqrshrn_n_s32(q10s32, 14);
+ d16s16 = vrshrn_n_s32(q13s32, 14);
+ d17s16 = vrshrn_n_s32(q14s32, 14);
+ d18s16 = vrshrn_n_s32(q15s32, 14);
+ d19s16 = vrshrn_n_s32(q10s32, 14);
*q8s16 = vcombine_s16(d16s16, d17s16);
*q9s16 = vcombine_s16(d18s16, d19s16);
}
-void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest,
- int dest_stride, int tx_type) {
+void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride,
+ int tx_type) {
uint8x8_t d26u8, d27u8;
int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16;
uint32x2_t d26u32, d27u32;
@@ -151,7 +151,7 @@ void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest,
switch (tx_type) {
case 0: // idct_idct is not supported. Fall back to C
- vp9_iht4x4_16_add_c(input, dest, dest_stride, tx_type);
+ vp9_iht4x4_16_add_c(input, dest, stride, tx_type);
return;
case 1: // iadst_idct
// generate constants
@@ -203,11 +203,11 @@ void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest,
q9s16 = vrshrq_n_s16(q9s16, 4);
d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 0);
- dest += dest_stride;
+ dest += stride;
d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 1);
- dest += dest_stride;
+ dest += stride;
d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 0);
- dest += dest_stride;
+ dest += stride;
d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 1);
q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u32(d26u32));
@@ -217,10 +217,10 @@ void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest,
d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 1);
- dest -= dest_stride;
+ dest -= stride;
vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 0);
- dest -= dest_stride;
+ dest -= stride;
vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 1);
- dest -= dest_stride;
+ dest -= stride;
vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 0);
}
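The swap above from vqrshrn_n_s32 to vrshrn_n_s32 keeps the rounding right shift by DCT_CONST_BITS (14) and the narrow from 32 to 16 bits but drops the saturation step, which mirrors the plain ROUND_POWER_OF_TWO() rounding the C reference performs. A scalar sketch of the per-lane computation; the sample operands are only for illustration:

    #include <stdint.h>
    #include <stdio.h>

    #define DCT_CONST_BITS 14

    /* Per-lane equivalent of vrshrn_n_s32(x, 14): rounding right shift,
     * then a truncating (non-saturating) narrow to int16_t. */
    static int16_t round_shift_narrow(int32_t x) {
      return (int16_t)((x + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS);
    }

    int main(void) {
      printf("%d\n", round_shift_narrow(11585 * (123 + 45))); /* cospi_16_64-style product -> 119 */
      printf("%d\n", round_shift_narrow(-11585 * 200));       /* -> -141 */
      return 0;
    }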
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c
index f7e0a6d9817..1c739861c38 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c
@@ -76,10 +76,10 @@ static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16,
q5s32 = vmlsl_s16(q5s32, d22s16, d3s16);
q6s32 = vmlsl_s16(q6s32, d23s16, d3s16);
- d8s16 = vqrshrn_n_s32(q2s32, 14);
- d9s16 = vqrshrn_n_s32(q3s32, 14);
- d10s16 = vqrshrn_n_s32(q5s32, 14);
- d11s16 = vqrshrn_n_s32(q6s32, 14);
+ d8s16 = vrshrn_n_s32(q2s32, 14);
+ d9s16 = vrshrn_n_s32(q3s32, 14);
+ d10s16 = vrshrn_n_s32(q5s32, 14);
+ d11s16 = vrshrn_n_s32(q6s32, 14);
q4s16 = vcombine_s16(d8s16, d9s16);
q5s16 = vcombine_s16(d10s16, d11s16);
@@ -93,10 +93,10 @@ static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16,
q9s32 = vmlal_s16(q9s32, d22s16, d2s16);
q13s32 = vmlal_s16(q13s32, d23s16, d2s16);
- d14s16 = vqrshrn_n_s32(q2s32, 14);
- d15s16 = vqrshrn_n_s32(q3s32, 14);
- d12s16 = vqrshrn_n_s32(q9s32, 14);
- d13s16 = vqrshrn_n_s32(q13s32, 14);
+ d14s16 = vrshrn_n_s32(q2s32, 14);
+ d15s16 = vrshrn_n_s32(q3s32, 14);
+ d12s16 = vrshrn_n_s32(q9s32, 14);
+ d13s16 = vrshrn_n_s32(q13s32, 14);
q6s16 = vcombine_s16(d12s16, d13s16);
q7s16 = vcombine_s16(d14s16, d15s16);
@@ -115,10 +115,10 @@ static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16,
d0s16 = vdup_n_s16(cospi_24_64);
d1s16 = vdup_n_s16(cospi_8_64);
- d18s16 = vqrshrn_n_s32(q2s32, 14);
- d19s16 = vqrshrn_n_s32(q3s32, 14);
- d22s16 = vqrshrn_n_s32(q13s32, 14);
- d23s16 = vqrshrn_n_s32(q15s32, 14);
+ d18s16 = vrshrn_n_s32(q2s32, 14);
+ d19s16 = vrshrn_n_s32(q3s32, 14);
+ d22s16 = vrshrn_n_s32(q13s32, 14);
+ d23s16 = vrshrn_n_s32(q15s32, 14);
*q9s16 = vcombine_s16(d18s16, d19s16);
*q11s16 = vcombine_s16(d22s16, d23s16);
@@ -132,10 +132,10 @@ static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16,
q8s32 = vmlal_s16(q8s32, d28s16, d0s16);
q12s32 = vmlal_s16(q12s32, d29s16, d0s16);
- d26s16 = vqrshrn_n_s32(q2s32, 14);
- d27s16 = vqrshrn_n_s32(q3s32, 14);
- d30s16 = vqrshrn_n_s32(q8s32, 14);
- d31s16 = vqrshrn_n_s32(q12s32, 14);
+ d26s16 = vrshrn_n_s32(q2s32, 14);
+ d27s16 = vrshrn_n_s32(q3s32, 14);
+ d30s16 = vrshrn_n_s32(q8s32, 14);
+ d31s16 = vrshrn_n_s32(q12s32, 14);
*q13s16 = vcombine_s16(d26s16, d27s16);
*q15s16 = vcombine_s16(d30s16, d31s16);
@@ -165,10 +165,10 @@ static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16,
q11s32 = vmlal_s16(q11s32, d26s16, d16s16);
q12s32 = vmlal_s16(q12s32, d27s16, d16s16);
- d10s16 = vqrshrn_n_s32(q9s32, 14);
- d11s16 = vqrshrn_n_s32(q10s32, 14);
- d12s16 = vqrshrn_n_s32(q11s32, 14);
- d13s16 = vqrshrn_n_s32(q12s32, 14);
+ d10s16 = vrshrn_n_s32(q9s32, 14);
+ d11s16 = vrshrn_n_s32(q10s32, 14);
+ d12s16 = vrshrn_n_s32(q11s32, 14);
+ d13s16 = vrshrn_n_s32(q12s32, 14);
q5s16 = vcombine_s16(d10s16, d11s16);
q6s16 = vcombine_s16(d12s16, d13s16);
@@ -242,8 +242,8 @@ static INLINE void IADST8X8_1D(int16x8_t *q8s16, int16x8_t *q9s16,
q1s32 = vsubq_s32(q1s32, q5s32);
q2s32 = vsubq_s32(q2s32, q6s32);
- d22s16 = vqrshrn_n_s32(q11s32, 14);
- d23s16 = vqrshrn_n_s32(q12s32, 14);
+ d22s16 = vrshrn_n_s32(q11s32, 14);
+ d23s16 = vrshrn_n_s32(q12s32, 14);
*q11s16 = vcombine_s16(d22s16, d23s16);
q12s32 = vaddq_s32(q3s32, q7s32);
@@ -251,12 +251,12 @@ static INLINE void IADST8X8_1D(int16x8_t *q8s16, int16x8_t *q9s16,
q3s32 = vsubq_s32(q3s32, q7s32);
q4s32 = vsubq_s32(q4s32, q8s32);
- d2s16 = vqrshrn_n_s32(q1s32, 14);
- d3s16 = vqrshrn_n_s32(q2s32, 14);
- d24s16 = vqrshrn_n_s32(q12s32, 14);
- d25s16 = vqrshrn_n_s32(q15s32, 14);
- d6s16 = vqrshrn_n_s32(q3s32, 14);
- d7s16 = vqrshrn_n_s32(q4s32, 14);
+ d2s16 = vrshrn_n_s32(q1s32, 14);
+ d3s16 = vrshrn_n_s32(q2s32, 14);
+ d24s16 = vrshrn_n_s32(q12s32, 14);
+ d25s16 = vrshrn_n_s32(q15s32, 14);
+ d6s16 = vrshrn_n_s32(q3s32, 14);
+ d7s16 = vrshrn_n_s32(q4s32, 14);
*q12s16 = vcombine_s16(d24s16, d25s16);
d0s16 = vdup_n_s16(cospi_10_64);
@@ -291,10 +291,10 @@ static INLINE void IADST8X8_1D(int16x8_t *q8s16, int16x8_t *q9s16,
q2s32 = vsubq_s32(q2s32, q10s32);
q6s32 = vsubq_s32(q6s32, q9s32);
- d28s16 = vqrshrn_n_s32(q14s32, 14);
- d29s16 = vqrshrn_n_s32(q15s32, 14);
- d4s16 = vqrshrn_n_s32(q2s32, 14);
- d5s16 = vqrshrn_n_s32(q6s32, 14);
+ d28s16 = vrshrn_n_s32(q14s32, 14);
+ d29s16 = vrshrn_n_s32(q15s32, 14);
+ d4s16 = vrshrn_n_s32(q2s32, 14);
+ d5s16 = vrshrn_n_s32(q6s32, 14);
*q14s16 = vcombine_s16(d28s16, d29s16);
q9s32 = vaddq_s32(q4s32, q0s32);
@@ -305,10 +305,10 @@ static INLINE void IADST8X8_1D(int16x8_t *q8s16, int16x8_t *q9s16,
d30s16 = vdup_n_s16(cospi_8_64);
d31s16 = vdup_n_s16(cospi_24_64);
- d18s16 = vqrshrn_n_s32(q9s32, 14);
- d19s16 = vqrshrn_n_s32(q10s32, 14);
- d8s16 = vqrshrn_n_s32(q4s32, 14);
- d9s16 = vqrshrn_n_s32(q5s32, 14);
+ d18s16 = vrshrn_n_s32(q9s32, 14);
+ d19s16 = vrshrn_n_s32(q10s32, 14);
+ d8s16 = vrshrn_n_s32(q4s32, 14);
+ d9s16 = vrshrn_n_s32(q5s32, 14);
*q9s16 = vcombine_s16(d18s16, d19s16);
q5s32 = vmull_s16(d2s16, d30s16);
@@ -341,10 +341,10 @@ static INLINE void IADST8X8_1D(int16x8_t *q8s16, int16x8_t *q9s16,
q5s32 = vsubq_s32(q5s32, q1s32);
q6s32 = vsubq_s32(q6s32, q3s32);
- d18s16 = vqrshrn_n_s32(q14s32, 14);
- d19s16 = vqrshrn_n_s32(q15s32, 14);
- d10s16 = vqrshrn_n_s32(q5s32, 14);
- d11s16 = vqrshrn_n_s32(q6s32, 14);
+ d18s16 = vrshrn_n_s32(q14s32, 14);
+ d19s16 = vrshrn_n_s32(q15s32, 14);
+ d10s16 = vrshrn_n_s32(q5s32, 14);
+ d11s16 = vrshrn_n_s32(q6s32, 14);
*q9s16 = vcombine_s16(d18s16, d19s16);
q1s32 = vaddq_s32(q7s32, q10s32);
@@ -352,10 +352,10 @@ static INLINE void IADST8X8_1D(int16x8_t *q8s16, int16x8_t *q9s16,
q7s32 = vsubq_s32(q7s32, q10s32);
q0s32 = vsubq_s32(q0s32, q2s32);
- d28s16 = vqrshrn_n_s32(q1s32, 14);
- d29s16 = vqrshrn_n_s32(q3s32, 14);
- d14s16 = vqrshrn_n_s32(q7s32, 14);
- d15s16 = vqrshrn_n_s32(q0s32, 14);
+ d28s16 = vrshrn_n_s32(q1s32, 14);
+ d29s16 = vrshrn_n_s32(q3s32, 14);
+ d14s16 = vrshrn_n_s32(q7s32, 14);
+ d15s16 = vrshrn_n_s32(q0s32, 14);
*q14s16 = vcombine_s16(d28s16, d29s16);
d30s16 = vdup_n_s16(cospi_16_64);
@@ -374,10 +374,10 @@ static INLINE void IADST8X8_1D(int16x8_t *q8s16, int16x8_t *q9s16,
q13s32 = vmlsl_s16(q13s32, d24s16, d30s16);
q1s32 = vmlsl_s16(q1s32, d25s16, d30s16);
- d4s16 = vqrshrn_n_s32(q2s32, 14);
- d5s16 = vqrshrn_n_s32(q3s32, 14);
- d24s16 = vqrshrn_n_s32(q13s32, 14);
- d25s16 = vqrshrn_n_s32(q1s32, 14);
+ d4s16 = vrshrn_n_s32(q2s32, 14);
+ d5s16 = vrshrn_n_s32(q3s32, 14);
+ d24s16 = vrshrn_n_s32(q13s32, 14);
+ d25s16 = vrshrn_n_s32(q1s32, 14);
q2s16 = vcombine_s16(d4s16, d5s16);
*q12s16 = vcombine_s16(d24s16, d25s16);
@@ -391,10 +391,10 @@ static INLINE void IADST8X8_1D(int16x8_t *q8s16, int16x8_t *q9s16,
q11s32 = vmlsl_s16(q11s32, d14s16, d30s16);
q0s32 = vmlsl_s16(q0s32, d15s16, d30s16);
- d20s16 = vqrshrn_n_s32(q13s32, 14);
- d21s16 = vqrshrn_n_s32(q1s32, 14);
- d12s16 = vqrshrn_n_s32(q11s32, 14);
- d13s16 = vqrshrn_n_s32(q0s32, 14);
+ d20s16 = vrshrn_n_s32(q13s32, 14);
+ d21s16 = vrshrn_n_s32(q1s32, 14);
+ d12s16 = vrshrn_n_s32(q11s32, 14);
+ d13s16 = vrshrn_n_s32(q0s32, 14);
*q10s16 = vcombine_s16(d20s16, d21s16);
q6s16 = vcombine_s16(d12s16, d13s16);
@@ -406,8 +406,8 @@ static INLINE void IADST8X8_1D(int16x8_t *q8s16, int16x8_t *q9s16,
*q15s16 = vsubq_s16(q5s16, q4s16);
}
-void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest,
- int dest_stride, int tx_type) {
+void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride,
+ int tx_type) {
int i;
uint8_t *d1, *d2;
uint8x8_t d0u8, d1u8, d2u8, d3u8;
@@ -429,7 +429,7 @@ void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest,
switch (tx_type) {
case 0: // idct_idct is not supported. Fall back to C
- vp9_iht8x8_64_add_c(input, dest, dest_stride, tx_type);
+ vp9_iht8x8_64_add_c(input, dest, stride, tx_type);
return;
case 1: // iadst_idct
// generate IDCT constants
@@ -508,13 +508,13 @@ void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest,
}
d0u64 = vld1_u64((uint64_t *)d1);
- d1 += dest_stride;
+ d1 += stride;
d1u64 = vld1_u64((uint64_t *)d1);
- d1 += dest_stride;
+ d1 += stride;
d2u64 = vld1_u64((uint64_t *)d1);
- d1 += dest_stride;
+ d1 += stride;
d3u64 = vld1_u64((uint64_t *)d1);
- d1 += dest_stride;
+ d1 += stride;
q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64));
q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64));
@@ -529,12 +529,12 @@ void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest,
d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
- d2 += dest_stride;
+ d2 += stride;
vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
- d2 += dest_stride;
+ d2 += stride;
vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
- d2 += dest_stride;
+ d2 += stride;
vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
- d2 += dest_stride;
+ d2 += stride;
}
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c
index 2d4839174db..f6b29265e66 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c
@@ -21,8 +21,8 @@
#include "vpx_ports/mem.h"
#if HAVE_DSPR2
-void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest,
- int dest_stride, int tx_type) {
+void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, int stride,
+ int tx_type) {
int i, j;
DECLARE_ALIGNED(32, int16_t, out[4 * 4]);
int16_t *outptr = out;
@@ -37,7 +37,7 @@ void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest,
switch (tx_type) {
case DCT_DCT: // DCT in both horizontal and vertical
vpx_idct4_rows_dspr2(input, outptr);
- vpx_idct4_columns_add_blk_dspr2(&out[0], dest, dest_stride);
+ vpx_idct4_columns_add_blk_dspr2(&out[0], dest, stride);
break;
case ADST_DCT: // ADST in vertical, DCT in horizontal
vpx_idct4_rows_dspr2(input, outptr);
@@ -48,8 +48,8 @@ void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest,
iadst4_dspr2(outptr, temp_out);
for (j = 0; j < 4; ++j)
- dest[j * dest_stride + i] = clip_pixel(
- ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i]);
+ dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4) +
+ dest[j * stride + i]);
outptr += 4;
}
@@ -66,7 +66,7 @@ void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest,
temp_in[i * 4 + j] = out[j * 4 + i];
}
}
- vpx_idct4_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride);
+ vpx_idct4_columns_add_blk_dspr2(&temp_in[0], dest, stride);
break;
case ADST_ADST: // ADST in both directions
for (i = 0; i < 4; ++i) {
@@ -80,8 +80,8 @@ void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest,
iadst4_dspr2(temp_in, temp_out);
for (j = 0; j < 4; ++j)
- dest[j * dest_stride + i] = clip_pixel(
- ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i]);
+ dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4) +
+ dest[j * stride + i]);
}
break;
default: printf("vp9_short_iht4x4_add_dspr2 : Invalid tx_type\n"); break;
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c
index 86896f04ca5..b945e307e63 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c
@@ -20,8 +20,8 @@
#include "vpx_ports/mem.h"
#if HAVE_DSPR2
-void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest,
- int dest_stride, int tx_type) {
+void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, int stride,
+ int tx_type) {
int i, j;
DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
int16_t *outptr = out;
@@ -34,7 +34,7 @@ void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest,
switch (tx_type) {
case DCT_DCT: // DCT in both horizontal and vertical
idct8_rows_dspr2(input, outptr, 8);
- idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride);
+ idct8_columns_add_blk_dspr2(&out[0], dest, stride);
break;
case ADST_DCT: // ADST in vertical, DCT in horizontal
idct8_rows_dspr2(input, outptr, 8);
@@ -43,8 +43,8 @@ void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest,
iadst8_dspr2(&out[i * 8], temp_out);
for (j = 0; j < 8; ++j)
- dest[j * dest_stride + i] = clip_pixel(
- ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]);
+ dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) +
+ dest[j * stride + i]);
}
break;
case DCT_ADST: // DCT in vertical, ADST in horizontal
@@ -59,7 +59,7 @@ void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest,
temp_in[i * 8 + j] = out[j * 8 + i];
}
}
- idct8_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride);
+ idct8_columns_add_blk_dspr2(&temp_in[0], dest, stride);
break;
case ADST_ADST: // ADST in both directions
for (i = 0; i < 8; ++i) {
@@ -74,8 +74,8 @@ void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest,
iadst8_dspr2(temp_in, temp_out);
for (j = 0; j < 8; ++j)
- dest[j * dest_stride + i] = clip_pixel(
- ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]);
+ dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) +
+ dest[j * stride + i]);
}
break;
default: printf("vp9_short_iht8x8_add_dspr2 : Invalid tx_type\n"); break;
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_ppflags.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_ppflags.h
index 6dcfa412bee..b8b647bf18d 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_ppflags.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_ppflags.h
@@ -20,14 +20,7 @@ enum {
VP9D_DEBLOCK = 1 << 0,
VP9D_DEMACROBLOCK = 1 << 1,
VP9D_ADDNOISE = 1 << 2,
- VP9D_DEBUG_TXT_FRAME_INFO = 1 << 3,
- VP9D_DEBUG_TXT_MBLK_MODES = 1 << 4,
- VP9D_DEBUG_TXT_DC_DIFF = 1 << 5,
- VP9D_DEBUG_TXT_RATE_INFO = 1 << 6,
- VP9D_DEBUG_DRAW_MV = 1 << 7,
- VP9D_DEBUG_CLR_BLK_MODES = 1 << 8,
- VP9D_DEBUG_CLR_FRM_REF_BLKS = 1 << 9,
- VP9D_MFQE = 1 << 10
+ VP9D_MFQE = 1 << 3
};
typedef struct {
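
With the debug-only flags dropped, VP9D_MFQE moves from bit 10 down to bit 3 so the post-processing flag word stays dense. A small sketch of how such flags are combined and tested, using only the values kept in the enum above:

    #include <stdio.h>

    enum {
      VP9D_DEBLOCK = 1 << 0,
      VP9D_DEMACROBLOCK = 1 << 1,
      VP9D_ADDNOISE = 1 << 2,
      VP9D_MFQE = 1 << 3
    };

    int main(void) {
      int post_proc_flag = VP9D_DEBLOCK | VP9D_MFQE;
      if (post_proc_flag & VP9D_MFQE) printf("MFQE requested\n");
      if (!(post_proc_flag & VP9D_ADDNOISE)) printf("no noise pass\n");
      return 0;
    }
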
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_rtcd_defs.pl b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_rtcd_defs.pl
index abef0676396..088b004f528 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_rtcd_defs.pl
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_rtcd_defs.pl
@@ -48,16 +48,16 @@ specialize qw/vp9_filter_by_weight8x8 sse2 msa/;
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
# Force C versions if CONFIG_EMULATE_HARDWARE is 1
if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
- add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type";
- add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type";
add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
} else {
- add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type";
specialize qw/vp9_iht4x4_16_add sse2/;
- add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type";
specialize qw/vp9_iht8x8_64_add sse2/;
add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
@@ -66,16 +66,16 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
} else {
# Force C versions if CONFIG_EMULATE_HARDWARE is 1
if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
- add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type";
- add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type";
add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
} else {
- add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type";
specialize qw/vp9_iht4x4_16_add sse2 neon dspr2 msa/;
- add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type";
specialize qw/vp9_iht8x8_64_add sse2 neon dspr2 msa/;
add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
@@ -101,9 +101,9 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
#
# Note as optimized versions of these functions are added we need to add a check to ensure
# that when CONFIG_EMULATE_HARDWARE is on, it defaults to the C versions only.
- add_proto qw/void vp9_highbd_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
+ add_proto qw/void vp9_highbd_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd";
- add_proto qw/void vp9_highbd_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
+ add_proto qw/void vp9_highbd_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd";
add_proto qw/void vp9_highbd_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd";
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c
index 072d92e4e91..3dc88b1914e 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c
@@ -128,16 +128,20 @@ int vp9_cyclic_refresh_rc_bits_per_mb(const VP9_COMP *cpi, int i,
CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
int bits_per_mb;
int num8x8bl = cm->MBs << 2;
+ // Compute delta-q corresponding to qindex i.
+ int deltaq = compute_deltaq(cpi, i, cr->rate_ratio_qdelta);
// Weight for segment prior to encoding: take the average of the target
// number for the frame to be encoded and the actual from the previous frame.
+  // Use the target if it's less.
int target_refresh = cr->percent_refresh * cm->mi_rows * cm->mi_cols / 100;
+ double weight_segment_target = (double)(target_refresh) / num8x8bl;
double weight_segment =
(double)((target_refresh + cr->actual_num_seg1_blocks +
cr->actual_num_seg2_blocks) >>
1) /
num8x8bl;
- // Compute delta-q corresponding to qindex i.
- int deltaq = compute_deltaq(cpi, i, cr->rate_ratio_qdelta);
+ if (weight_segment_target < 7 * weight_segment / 8)
+ weight_segment = weight_segment_target;
// Take segment weighted average for bits per mb.
bits_per_mb = (int)((1.0 - weight_segment) *
vp9_rc_bits_per_mb(cm->frame_type, i,
@@ -383,13 +387,14 @@ static void cyclic_refresh_update_map(VP9_COMP *const cpi) {
: vp9_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST1, cm->base_qindex);
// More aggressive settings for noisy content.
if (cpi->noise_estimate.enabled && cpi->noise_estimate.level >= kMedium) {
- consec_zero_mv_thresh = 80;
+ consec_zero_mv_thresh = 60;
qindex_thresh =
VPXMAX(vp9_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST1, cm->base_qindex),
- 7 * cm->base_qindex >> 3);
+ cm->base_qindex);
}
do {
int sum_map = 0;
+ int consec_zero_mv_thresh_block = consec_zero_mv_thresh;
// Get the mi_row/mi_col corresponding to superblock index i.
int sb_row_index = (i / sb_cols);
int sb_col_index = i - sb_row_index * sb_cols;
@@ -403,6 +408,9 @@ static void cyclic_refresh_update_map(VP9_COMP *const cpi) {
VPXMIN(cm->mi_cols - mi_col, num_8x8_blocks_wide_lookup[BLOCK_64X64]);
ymis =
VPXMIN(cm->mi_rows - mi_row, num_8x8_blocks_high_lookup[BLOCK_64X64]);
+ if (cpi->noise_estimate.enabled && cpi->noise_estimate.level >= kMedium &&
+ (xmis <= 2 || ymis <= 2))
+ consec_zero_mv_thresh_block = 10;
for (y = 0; y < ymis; y++) {
for (x = 0; x < xmis; x++) {
const int bl_index2 = bl_index + y * cm->mi_cols + x;
@@ -412,7 +420,7 @@ static void cyclic_refresh_update_map(VP9_COMP *const cpi) {
if (cr->map[bl_index2] == 0) {
count_tot++;
if (cr->last_coded_q_map[bl_index2] > qindex_thresh ||
- cpi->consec_zero_mv[bl_index2] < consec_zero_mv_thresh) {
+ cpi->consec_zero_mv[bl_index2] < consec_zero_mv_thresh_block) {
sum_map++;
count_sel++;
}
@@ -468,8 +476,8 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) {
}
// Adjust some parameters for low resolutions at low bitrates.
if (cm->width <= 352 && cm->height <= 288 && rc->avg_frame_bandwidth < 3400) {
- cr->motion_thresh = 4;
- cr->rate_boost_fac = 10;
+ cr->motion_thresh = 16;
+ cr->rate_boost_fac = 13;
}
if (cpi->svc.spatial_layer_id > 0) {
cr->motion_thresh = 4;
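
In vp9_cyclic_refresh_rc_bits_per_mb() above, the segment weight becomes the average of the target refresh count and the blocks actually refreshed in the previous frame, and is replaced by the target-only weight whenever that target falls below 7/8 of the average. A standalone sketch of the same arithmetic with made-up block counts (every value below is a placeholder, not real encoder state):

    #include <stdio.h>

    int main(void) {
      const int num8x8bl = 3600;       // 8x8 blocks in the frame (made up)
      const int target_refresh = 360;  // percent_refresh worth of blocks
      const int actual_seg1 = 500, actual_seg2 = 100;  // previous frame
      double weight_segment_target = (double)target_refresh / num8x8bl;
      double weight_segment =
          (double)((target_refresh + actual_seg1 + actual_seg2) >> 1) /
          num8x8bl;
      // Prefer the smaller, target-based weight when it is clearly below
      // the averaged weight, as in vp9_cyclic_refresh_rc_bits_per_mb().
      if (weight_segment_target < 7 * weight_segment / 8)
        weight_segment = weight_segment_target;
      printf("weight_segment = %f\n", weight_segment);
      return 0;
    }
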
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeframe.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeframe.c
index 3ab05375ff7..323c053edff 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeframe.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeframe.c
@@ -477,8 +477,8 @@ static void set_vbp_thresholds(VP9_COMP *cpi, int64_t thresholds[], int q) {
thresholds[2] = threshold_base >> 2;
thresholds[3] = threshold_base << 2;
} else {
- // Increase base variance threshold based on estimated noise level.
- if (cpi->noise_estimate.enabled) {
+ // Increase base variance threshold based on estimated noise level.
+ if (cpi->noise_estimate.enabled && cm->width >= 640 && cm->height >= 480) {
NOISE_LEVEL noise_level =
vp9_noise_estimate_extract_level(&cpi->noise_estimate);
if (noise_level == kHigh)
@@ -526,6 +526,7 @@ void vp9_set_variance_partition_thresholds(VP9_COMP *cpi, int q) {
: 1000;
cpi->vbp_bsize_min = BLOCK_16X16;
}
+ cpi->vbp_threshold_copy = cpi->vbp_thresholds[0] << 16;
cpi->vbp_threshold_minmax = 15 + (q >> 3);
}
}
@@ -742,9 +743,13 @@ static void set_low_temp_var_flag(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd,
continue;
if ((*this_mi)->sb_type == BLOCK_32X32) {
- if (vt->split[i].part_variances.none.variance < (thresholds[1] >> 1))
+ int64_t threshold_32x32 = (cpi->sf.short_circuit_low_temp_var == 1 ||
+ cpi->sf.short_circuit_low_temp_var == 3)
+ ? ((5 * thresholds[1]) >> 3)
+ : (thresholds[1] >> 1);
+ if (vt->split[i].part_variances.none.variance < threshold_32x32)
x->variance_low[i + 5] = 1;
- } else if (cpi->sf.short_circuit_low_temp_var == 2) {
+ } else if (cpi->sf.short_circuit_low_temp_var >= 2) {
// For 32x16 and 16x32 blocks, the flag is set on each 16x16 block
// inside.
if ((*this_mi)->sb_type == BLOCK_16X16 ||
@@ -762,6 +767,93 @@ static void set_low_temp_var_flag(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd,
}
}
+static void copy_prev_partition(VP9_COMP *cpi, BLOCK_SIZE bsize, int mi_row,
+ int mi_col) {
+ VP9_COMMON *const cm = &cpi->common;
+ BLOCK_SIZE *prev_part = cpi->prev_partition;
+ int start_pos = mi_row * cm->mi_stride + mi_col;
+
+ const int bsl = b_width_log2_lookup[bsize];
+ const int bs = (1 << bsl) / 4;
+ BLOCK_SIZE subsize;
+ PARTITION_TYPE partition;
+ MODE_INFO *mi = NULL;
+
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+
+ partition = partition_lookup[bsl][prev_part[start_pos]];
+ subsize = get_subsize(bsize, partition);
+ mi = cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col];
+
+ if (subsize < BLOCK_8X8) {
+ mi->sb_type = bsize;
+ } else {
+ switch (partition) {
+ case PARTITION_NONE: mi->sb_type = bsize; break;
+ case PARTITION_HORZ:
+ mi->sb_type = subsize;
+ if (mi_row + bs < cm->mi_rows)
+ cm->mi_grid_visible[(mi_row + bs) * cm->mi_stride + mi_col]->sb_type =
+ subsize;
+ break;
+ case PARTITION_VERT:
+ mi->sb_type = subsize;
+ if (mi_col + bs < cm->mi_cols)
+ cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col + bs]->sb_type =
+ subsize;
+ break;
+ case PARTITION_SPLIT:
+ copy_prev_partition(cpi, subsize, mi_row, mi_col);
+ copy_prev_partition(cpi, subsize, mi_row + bs, mi_col);
+ copy_prev_partition(cpi, subsize, mi_row, mi_col + bs);
+ copy_prev_partition(cpi, subsize, mi_row + bs, mi_col + bs);
+ break;
+ default: assert(0);
+ }
+ }
+}
+
+static void update_prev_partition(VP9_COMP *cpi, BLOCK_SIZE bsize, int mi_row,
+ int mi_col) {
+ VP9_COMMON *const cm = &cpi->common;
+ BLOCK_SIZE *prev_part = cpi->prev_partition;
+ int start_pos = mi_row * cm->mi_stride + mi_col;
+ const int bsl = b_width_log2_lookup[bsize];
+ const int bs = (1 << bsl) / 4;
+ BLOCK_SIZE subsize;
+ PARTITION_TYPE partition;
+ const MODE_INFO *mi = NULL;
+
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+
+ mi = cm->mi_grid_visible[start_pos];
+ partition = partition_lookup[bsl][mi->sb_type];
+ subsize = get_subsize(bsize, partition);
+ if (subsize < BLOCK_8X8) {
+ prev_part[start_pos] = bsize;
+ } else {
+ switch (partition) {
+ case PARTITION_NONE: prev_part[start_pos] = bsize; break;
+ case PARTITION_HORZ:
+ prev_part[start_pos] = subsize;
+ if (mi_row + bs < cm->mi_rows)
+ prev_part[start_pos + bs * cm->mi_stride] = subsize;
+ break;
+ case PARTITION_VERT:
+ prev_part[start_pos] = subsize;
+ if (mi_col + bs < cm->mi_cols) prev_part[start_pos + bs] = subsize;
+ break;
+ case PARTITION_SPLIT:
+ update_prev_partition(cpi, subsize, mi_row, mi_col);
+ update_prev_partition(cpi, subsize, mi_row + bs, mi_col);
+ update_prev_partition(cpi, subsize, mi_row, mi_col + bs);
+ update_prev_partition(cpi, subsize, mi_row + bs, mi_col + bs);
+ break;
+ default: assert(0);
+ }
+ }
+}
+
static void chroma_check(VP9_COMP *cpi, MACROBLOCK *x, int bsize,
unsigned int y_sad, int is_key_frame) {
int i;
@@ -824,6 +916,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
const int low_res = (cm->width <= 352 && cm->height <= 288);
int variance4x4downsample[16];
int segment_id;
+ int offset = cm->mi_stride * mi_row + mi_col;
set_offsets(cpi, tile, x, mi_row, mi_col, BLOCK_64X64);
segment_id = xd->mi[0]->segment_id;
@@ -834,8 +927,8 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
}
}
- threshold_4x4avg =
- (cpi->oxcf.speed < 8) ? thresholds[1] << 1 : thresholds[2] >> 1;
+ // For non keyframes, disable 4x4 average for low resolution when speed = 8
+ threshold_4x4avg = (cpi->oxcf.speed < 8) ? thresholds[1] << 1 : INT64_MAX;
memset(x->variance_low, 0, sizeof(x->variance_low));
@@ -857,7 +950,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, LAST_FRAME);
const YV12_BUFFER_CONFIG *yv12_g = NULL;
- unsigned int y_sad_g, y_sad_thr;
+ unsigned int y_sad_g, y_sad_thr, y_sad_last;
bsize = BLOCK_32X32 + (mi_col + 4 < cm->mi_cols) * 2 +
(mi_row + 4 < cm->mi_rows);
@@ -897,6 +990,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
mi->interp_filter = BILINEAR;
y_sad = vp9_int_pro_motion_estimation(cpi, x, bsize, mi_row, mi_col);
+ y_sad_last = y_sad;
// Pick ref frame for partitioning, bias last frame when y_sad_g and y_sad
// are close if short_circuit_low_temp_var is on.
y_sad_thr = cpi->sf.short_circuit_low_temp_var ? (y_sad * 7) >> 3 : y_sad;
@@ -937,6 +1031,20 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
return 0;
}
}
+
+ // If the y_sad is small enough, copy the partition of the superblock in the
+ // last frame to current frame only if the last frame is not a keyframe.
+ // TODO(jianj) : tune the threshold.
+ if (cpi->sf.copy_partition_flag && cpi->rc.frames_since_key > 1 &&
+ segment_id == CR_SEGMENT_ID_BASE &&
+ cpi->prev_segment_id[offset] == CR_SEGMENT_ID_BASE &&
+ y_sad_last < cpi->vbp_threshold_copy) {
+ if (cpi->prev_partition != NULL) {
+ copy_prev_partition(cpi, BLOCK_64X64, mi_row, mi_col);
+ chroma_check(cpi, x, bsize, y_sad, is_key_frame);
+ return 0;
+ }
+ }
} else {
d = VP9_VAR_OFFS;
dp = 0;
@@ -1131,6 +1239,11 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
}
}
+ if (cm->frame_type != KEY_FRAME && cpi->sf.copy_partition_flag) {
+ update_prev_partition(cpi, BLOCK_64X64, mi_row, mi_col);
+ cpi->prev_segment_id[offset] = segment_id;
+ }
+
if (cpi->sf.short_circuit_low_temp_var) {
set_low_temp_var_flag(cpi, x, xd, &vt, thresholds, ref_frame_partition,
mi_col, mi_row);
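
The new copy-partition path above records each superblock's partitioning in cpi->prev_partition via update_prev_partition() and replays it through copy_prev_partition() when the superblock is static enough. A compact sketch of just the reuse gate from choose_partitioning(), with the inputs flattened into plain parameters (can_copy_partition is an illustrative helper; CR_SEGMENT_ID_BASE is assumed to be 0 as in the cyclic-refresh code):

    #include <stdio.h>

    // Reuse the previous frame's partition for a 64x64 block only when the
    // feature is on, we are past the first inter frame after a key frame,
    // the block stays in the base cyclic-refresh segment in both frames,
    // and its SAD against the last frame is below the copy threshold.
    static int can_copy_partition(int copy_partition_flag, int frames_since_key,
                                  int segment_id, int prev_segment_id,
                                  unsigned int y_sad_last,
                                  long long vbp_threshold_copy) {
      return copy_partition_flag && frames_since_key > 1 && segment_id == 0 &&
             prev_segment_id == 0 &&
             y_sad_last < (unsigned long long)vbp_threshold_copy;
    }

    int main(void) {
      printf("%d\n", can_copy_partition(1, 5, 0, 0, 1200, 64LL << 16));
      return 0;
    }
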
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemb.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemb.c
index 20ebe68197e..2cb137d8b93 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemb.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemb.c
@@ -109,6 +109,8 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size,
int64_t error0, error1;
int16_t t0, t1;
EXTRABIT e0;
+ unsigned int(*const token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] =
+ mb->token_costs[tx_size][type][ref];
int best, band, pt, i, final_eob;
#if CONFIG_VP9_HIGHBITDEPTH
const int *cat6_high_cost = vp9_get_high_cost_table(xd->bd);
@@ -137,7 +139,6 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size,
int x = qcoeff[rc];
/* Only add a trellis state for non-zero coefficients. */
if (x) {
- int shortcut = 0;
error0 = tokens[next][0].error;
error1 = tokens[next][1].error;
/* Evaluate the first possibility for this state. */
@@ -148,10 +149,8 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size,
if (next < default_eob) {
band = band_translate[i + 1];
pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache);
- rate0 += mb->token_costs[tx_size][type][ref][band][0][pt]
- [tokens[next][0].token];
- rate1 += mb->token_costs[tx_size][type][ref][band][0][pt]
- [tokens[next][1].token];
+ rate0 += token_costs[band][0][pt][tokens[next][0].token];
+ rate1 += token_costs[band][0][pt][tokens[next][1].token];
}
UPDATE_RD_COST();
/* And pick the best. */
@@ -178,12 +177,7 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size,
if ((abs(x) * dequant_ptr[rc != 0] > (abs(coeff[rc]) << shift)) &&
(abs(x) * dequant_ptr[rc != 0] <
- (abs(coeff[rc]) << shift) + dequant_ptr[rc != 0]))
- shortcut = 1;
- else
- shortcut = 0;
-
- if (shortcut) {
+ (abs(coeff[rc]) << shift) + dequant_ptr[rc != 0])) {
sz = -(x < 0);
x -= 2 * sz + 1;
} else {
@@ -208,13 +202,11 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size,
band = band_translate[i + 1];
if (t0 != EOB_TOKEN) {
pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache);
- rate0 += mb->token_costs[tx_size][type][ref][band][!x][pt]
- [tokens[next][0].token];
+ rate0 += token_costs[band][!x][pt][tokens[next][0].token];
}
if (t1 != EOB_TOKEN) {
pt = trellis_get_coeff_context(scan, nb, i, t1, token_cache);
- rate1 += mb->token_costs[tx_size][type][ref][band][!x][pt]
- [tokens[next][1].token];
+ rate1 += token_costs[band][!x][pt][tokens[next][1].token];
}
}
@@ -223,18 +215,17 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size,
best = rd_cost1 < rd_cost0;
base_bits = vp9_get_cost(t0, e0, cat6_high_cost);
- if (shortcut) {
#if CONFIG_VP9_HIGHBITDEPTH
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- dx -= ((dequant_ptr[rc != 0] >> (xd->bd - 8)) + sz) ^ sz;
- } else {
- dx -= (dequant_ptr[rc != 0] + sz) ^ sz;
- }
-#else
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ dx -= ((dequant_ptr[rc != 0] >> (xd->bd - 8)) + sz) ^ sz;
+ } else {
dx -= (dequant_ptr[rc != 0] + sz) ^ sz;
-#endif // CONFIG_VP9_HIGHBITDEPTH
- d2 = dx * dx;
}
+#else
+ dx -= (dequant_ptr[rc != 0] + sz) ^ sz;
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ d2 = dx * dx;
+
tokens[i][1].rate = base_bits + (best ? rate1 : rate0);
tokens[i][1].error = d2 + (best ? error1 : error0);
tokens[i][1].next = next;
@@ -270,13 +261,11 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size,
t1 = tokens[next][1].token;
/* Update the cost of each path if we're past the EOB token. */
if (t0 != EOB_TOKEN) {
- tokens[next][0].rate +=
- mb->token_costs[tx_size][type][ref][band][1][pt][t0];
+ tokens[next][0].rate += token_costs[band][1][pt][t0];
tokens[next][0].token = ZERO_TOKEN;
}
if (t1 != EOB_TOKEN) {
- tokens[next][1].rate +=
- mb->token_costs[tx_size][type][ref][band][1][pt][t1];
+ tokens[next][1].rate += token_costs[band][1][pt][t1];
tokens[next][1].token = ZERO_TOKEN;
}
tokens[i][0].best_index = tokens[i][1].best_index = 0;
@@ -292,8 +281,8 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size,
error1 = tokens[next][1].error;
t0 = tokens[next][0].token;
t1 = tokens[next][1].token;
- rate0 += mb->token_costs[tx_size][type][ref][band][0][ctx][t0];
- rate1 += mb->token_costs[tx_size][type][ref][band][0][ctx][t1];
+ rate0 += token_costs[band][0][ctx][t0];
+ rate1 += token_costs[band][0][ctx][t1];
UPDATE_RD_COST();
best = rd_cost1 < rd_cost0;
final_eob = -1;
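
The trellis changes hoist the invariant mb->token_costs[tx_size][type][ref] lookup into a local pointer-to-array, so the inner loop indexes only [band][eob_ctx][pt][token], and fold the old shortcut flag into a single if. A small sketch of the same pointer-to-array idiom with toy dimensions (the table shape below is illustrative, not the real ENTROPY_TOKENS layout):

    #include <stdio.h>

    #define BANDS 6
    #define CTXS 6
    #define TOKENS 12

    int main(void) {
      static unsigned int table[4][2][3][BANDS][2][CTXS][TOKENS];
      const int tx_size = 1, type = 0, ref = 1;
      int band, pt;
      unsigned int sum = 0;
      // Cache the invariant part of the lookup once, outside the hot loop,
      // mirroring the local token_costs pointer in vp9_optimize_b().
      unsigned int(*const costs)[2][CTXS][TOKENS] = table[tx_size][type][ref];
      for (band = 0; band < BANDS; ++band)
        for (pt = 0; pt < CTXS; ++pt) sum += costs[band][0][pt][0];
      printf("%u\n", sum);
      return 0;
    }
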
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.c
index 2a58003829c..432eac8da00 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.c
@@ -108,7 +108,7 @@ static int is_psnr_calc_enabled(VP9_COMP *cpi) {
}
/* clang-format off */
-static const Vp9LevelSpec vp9_level_defs[VP9_LEVELS] = {
+const Vp9LevelSpec vp9_level_defs[VP9_LEVELS] = {
{ LEVEL_1, 829440, 36864, 200, 400, 2, 1, 4, 8 },
{ LEVEL_1_1, 2764800, 73728, 800, 1000, 2, 1, 4, 8 },
{ LEVEL_2, 4608000, 122880, 1800, 1500, 2, 1, 4, 8 },
@@ -128,6 +128,16 @@ static const Vp9LevelSpec vp9_level_defs[VP9_LEVELS] = {
};
/* clang-format on */
+static const char *level_fail_messages[TARGET_LEVEL_FAIL_IDS] =
+ { "The average bit-rate is too high.",
+ "The picture size is too large.",
+ "The luma sample rate is too large.",
+ "The CPB size is too large.",
+ "The compression ratio is too small",
+ "Too many column tiles are used.",
+ "The alt-ref distance is too small.",
+ "Too many reference buffers are used." };
+
static INLINE void Scale2Ratio(VPX_SCALING mode, int *hr, int *hs) {
switch (mode) {
case NORMAL:
@@ -224,8 +234,9 @@ VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec) {
for (i = 0; i < VP9_LEVELS; ++i) {
this_level = &vp9_level_defs[i];
- if ((double)level_spec->max_luma_sample_rate * (1 + SAMPLE_RATE_GRACE_P) >
- (double)this_level->max_luma_sample_rate ||
+ if ((double)level_spec->max_luma_sample_rate >
+ (double)this_level->max_luma_sample_rate *
+ (1 + SAMPLE_RATE_GRACE_P) ||
level_spec->max_luma_picture_size > this_level->max_luma_picture_size ||
level_spec->average_bitrate > this_level->average_bitrate ||
level_spec->max_cpb_size > this_level->max_cpb_size ||
@@ -439,6 +450,12 @@ static void dealloc_compressor_data(VP9_COMP *cpi) {
cpi->nmvsadcosts_hp[0] = NULL;
cpi->nmvsadcosts_hp[1] = NULL;
+ vpx_free(cpi->prev_partition);
+ cpi->prev_partition = NULL;
+
+ vpx_free(cpi->prev_segment_id);
+ cpi->prev_segment_id = NULL;
+
vp9_cyclic_refresh_free(cpi->cyclic_refresh);
cpi->cyclic_refresh = NULL;
@@ -872,6 +889,22 @@ static void init_buffer_indices(VP9_COMP *cpi) {
cpi->alt_fb_idx = 2;
}
+static void init_level_constraint(LevelConstraint *lc) {
+ lc->level_index = -1;
+ lc->max_cpb_size = INT_MAX;
+ lc->max_frame_size = INT_MAX;
+ lc->rc_config_updated = 0;
+ lc->fail_flag = 0;
+}
+
+static void set_level_constraint(LevelConstraint *ls, int8_t level_index) {
+ vpx_clear_system_state();
+ ls->level_index = level_index;
+ if (level_index >= 0) {
+ ls->max_cpb_size = vp9_level_defs[level_index].max_cpb_size * (double)1000;
+ }
+}
+
static void init_config(struct VP9_COMP *cpi, VP9EncoderConfig *oxcf) {
VP9_COMMON *const cm = &cpi->common;
@@ -887,6 +920,8 @@ static void init_config(struct VP9_COMP *cpi, VP9EncoderConfig *oxcf) {
cpi->target_level = oxcf->target_level;
cpi->keep_level_stats = oxcf->target_level != LEVEL_MAX;
+ set_level_constraint(&cpi->level_constraint,
+ get_level_index(cpi->target_level));
cm->width = oxcf->width;
cm->height = oxcf->height;
@@ -1403,6 +1438,8 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
cpi->target_level = oxcf->target_level;
cpi->keep_level_stats = oxcf->target_level != LEVEL_MAX;
+ set_level_constraint(&cpi->level_constraint,
+ get_level_index(cpi->target_level));
if (cm->profile <= PROFILE_1)
assert(cm->bit_depth == VPX_BITS_8);
@@ -1679,6 +1716,7 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf,
cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS;
init_level_info(&cpi->level_info);
+ init_level_constraint(&cpi->level_constraint);
#if CONFIG_INTERNAL_STATS
cpi->b_calculate_blockiness = 1;
@@ -3127,7 +3165,7 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
if (cpi->oxcf.pass == 0 && cpi->oxcf.mode == REALTIME &&
cpi->oxcf.speed >= 5 && cpi->resize_state == 0 &&
(cpi->oxcf.content == VP9E_CONTENT_SCREEN ||
- cpi->oxcf.rc_mode == VPX_VBR) &&
+ cpi->oxcf.rc_mode == VPX_VBR || cpi->sf.copy_partition_flag) &&
cm->show_frame)
vp9_avg_source_sad(cpi);
@@ -3238,9 +3276,14 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size,
int frame_over_shoot_limit;
int frame_under_shoot_limit;
int q = 0, q_low = 0, q_high = 0;
+ int enable_acl;
set_size_independent_vars(cpi);
+ enable_acl = cpi->sf.allow_acl
+ ? (cm->frame_type == KEY_FRAME) || (cm->show_frame == 0)
+ : 0;
+
do {
vpx_clear_system_state();
@@ -3335,7 +3378,6 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size,
if (!cpi->sf.use_nonrd_pick_mode) vp9_pack_bitstream(cpi, dest, size);
rc->projected_frame_size = (int)(*size) << 3;
- restore_coding_context(cpi);
if (frame_over_shoot_limit == 0) frame_over_shoot_limit = 1;
}
@@ -3505,7 +3547,22 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size,
++cpi->tot_recode_hits;
#endif
}
+
+ if (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF)
+ if (loop || !enable_acl) restore_coding_context(cpi);
} while (loop);
+
+ if (enable_acl) {
+ vp9_encode_frame(cpi);
+ vpx_clear_system_state();
+ restore_coding_context(cpi);
+ vp9_pack_bitstream(cpi, dest, size);
+
+ vp9_encode_frame(cpi);
+ vpx_clear_system_state();
+
+ restore_coding_context(cpi);
+ }
}
static int get_ref_frame_flags(const VP9_COMP *cpi) {
@@ -4288,6 +4345,26 @@ static void adjust_image_stat(double y, double u, double v, double all,
}
#endif // CONFIG_INTERNAL_STATS
+// Adjust the maximum allowable frame size for the target level.
+static void level_rc_framerate(VP9_COMP *cpi, int arf_src_index) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ LevelConstraint *const ls = &cpi->level_constraint;
+ VP9_COMMON *const cm = &cpi->common;
+ const double max_cpb_size = ls->max_cpb_size;
+ vpx_clear_system_state();
+ rc->max_frame_bandwidth = VPXMIN(rc->max_frame_bandwidth, ls->max_frame_size);
+ if (frame_is_intra_only(cm)) {
+ rc->max_frame_bandwidth =
+ VPXMIN(rc->max_frame_bandwidth, (int)(max_cpb_size * 0.5));
+ } else if (arf_src_index > 0) {
+ rc->max_frame_bandwidth =
+ VPXMIN(rc->max_frame_bandwidth, (int)(max_cpb_size * 0.4));
+ } else {
+ rc->max_frame_bandwidth =
+ VPXMIN(rc->max_frame_bandwidth, (int)(max_cpb_size * 0.2));
+ }
+}
+
static void update_level_info(VP9_COMP *cpi, size_t *size, int arf_src_index) {
VP9_COMMON *const cm = &cpi->common;
Vp9LevelInfo *const level_info = &cpi->level_info;
@@ -4296,6 +4373,8 @@ static void update_level_info(VP9_COMP *cpi, size_t *size, int arf_src_index) {
int i, idx;
uint64_t luma_samples, dur_end;
const uint32_t luma_pic_size = cm->width * cm->height;
+ LevelConstraint *const level_constraint = &cpi->level_constraint;
+ const int8_t level_index = level_constraint->level_index;
double cpb_data_size;
vpx_clear_system_state();
@@ -4406,6 +4485,78 @@ static void update_level_info(VP9_COMP *cpi, size_t *size, int arf_src_index) {
if (level_spec->max_col_tiles < (1 << cm->log2_tile_cols)) {
level_spec->max_col_tiles = (1 << cm->log2_tile_cols);
}
+
+ if (level_index >= 0 && level_constraint->fail_flag == 0) {
+ if (level_spec->max_luma_picture_size >
+ vp9_level_defs[level_index].max_luma_picture_size) {
+ level_constraint->fail_flag |= (1 << LUMA_PIC_SIZE_TOO_LARGE);
+ vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
+ "Failed to encode to the target level %d. %s",
+ vp9_level_defs[level_index].level,
+ level_fail_messages[LUMA_PIC_SIZE_TOO_LARGE]);
+ }
+
+ if ((double)level_spec->max_luma_sample_rate >
+ (double)vp9_level_defs[level_index].max_luma_sample_rate *
+ (1 + SAMPLE_RATE_GRACE_P)) {
+ level_constraint->fail_flag |= (1 << LUMA_SAMPLE_RATE_TOO_LARGE);
+ vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
+ "Failed to encode to the target level %d. %s",
+ vp9_level_defs[level_index].level,
+ level_fail_messages[LUMA_SAMPLE_RATE_TOO_LARGE]);
+ }
+
+ if (level_spec->max_col_tiles > vp9_level_defs[level_index].max_col_tiles) {
+ level_constraint->fail_flag |= (1 << TOO_MANY_COLUMN_TILE);
+ vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
+ "Failed to encode to the target level %d. %s",
+ vp9_level_defs[level_index].level,
+ level_fail_messages[TOO_MANY_COLUMN_TILE]);
+ }
+
+ if (level_spec->min_altref_distance <
+ vp9_level_defs[level_index].min_altref_distance) {
+ level_constraint->fail_flag |= (1 << ALTREF_DIST_TOO_SMALL);
+ vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
+ "Failed to encode to the target level %d. %s",
+ vp9_level_defs[level_index].level,
+ level_fail_messages[ALTREF_DIST_TOO_SMALL]);
+ }
+
+ if (level_spec->max_ref_frame_buffers >
+ vp9_level_defs[level_index].max_ref_frame_buffers) {
+ level_constraint->fail_flag |= (1 << TOO_MANY_REF_BUFFER);
+ vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
+ "Failed to encode to the target level %d. %s",
+ vp9_level_defs[level_index].level,
+ level_fail_messages[TOO_MANY_REF_BUFFER]);
+ }
+
+ if (level_spec->max_cpb_size > vp9_level_defs[level_index].max_cpb_size) {
+ level_constraint->fail_flag |= (1 << CPB_TOO_LARGE);
+ vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
+ "Failed to encode to the target level %d. %s",
+ vp9_level_defs[level_index].level,
+ level_fail_messages[CPB_TOO_LARGE]);
+ }
+
+ // Set an upper bound for the next frame size. It will be used in
+ // level_rc_framerate() before encoding the next frame.
+ cpb_data_size = 0;
+ for (i = 0; i < CPB_WINDOW_SIZE - 1; ++i) {
+ if (i >= level_stats->frame_window_buffer.len) break;
+ idx = (level_stats->frame_window_buffer.start +
+ level_stats->frame_window_buffer.len - 1 - i) %
+ FRAME_WINDOW_SIZE;
+ cpb_data_size += level_stats->frame_window_buffer.buf[idx].size;
+ }
+ cpb_data_size = cpb_data_size / 125.0;
+ level_constraint->max_frame_size =
+ (int)((vp9_level_defs[level_index].max_cpb_size - cpb_data_size) *
+ 1000.0);
+ if (level_stats->frame_window_buffer.len < CPB_WINDOW_SIZE - 1)
+ level_constraint->max_frame_size >>= 1;
+ }
}
int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
@@ -4633,6 +4784,10 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
set_frame_size(cpi);
}
+ if (oxcf->pass != 1 && cpi->level_constraint.level_index >= 0 &&
+ cpi->level_constraint.fail_flag == 0)
+ level_rc_framerate(cpi, arf_src_index);
+
if (cpi->oxcf.pass != 0 || cpi->use_svc || frame_is_intra_only(cm) == 1) {
for (i = 0; i < MAX_REF_FRAMES; ++i) cpi->scaled_ref_idx[i] = INVALID_IDX;
}
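
level_rc_framerate() above caps the next frame's bit budget at a fraction of the target level's CPB: roughly half the CPB for intra frames, 40% when an alt-ref source is coded, and 20% otherwise, while update_level_info() keeps a running CPB tally and raises a fail flag with one of the level_fail_messages when a constraint is violated. A hedged sketch of just the clamping step (cap_frame_bits is an illustrative helper; sizes are in bits):

    #include <stdio.h>

    // Clamp the per-frame budget to a fraction of the level's CPB, mirroring
    // the 0.5 / 0.4 / 0.2 factors used by level_rc_framerate() above.
    static int cap_frame_bits(int max_frame_bandwidth, double max_cpb_size,
                              int is_intra, int is_arf_src) {
      const double frac = is_intra ? 0.5 : (is_arf_src ? 0.4 : 0.2);
      const int cap = (int)(max_cpb_size * frac);
      return max_frame_bandwidth < cap ? max_frame_bandwidth : cap;
    }

    int main(void) {
      printf("%d\n", cap_frame_bits(800000, 1000000.0, 0, 0));  // 200000
      return 0;
    }
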
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.h
index 0007e6395da..de324d3aab9 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.h
@@ -237,7 +237,7 @@ typedef struct VP9EncoderConfig {
int max_threads;
- int target_level;
+ unsigned int target_level;
vpx_fixed_buf_t two_pass_stats_in;
struct vpx_codec_pkt_list *output_pkt_list;
@@ -341,6 +341,8 @@ typedef struct {
uint8_t max_ref_frame_buffers;
} Vp9LevelSpec;
+extern const Vp9LevelSpec vp9_level_defs[VP9_LEVELS];
+
typedef struct {
int64_t ts; // timestamp
uint32_t luma_samples;
@@ -368,6 +370,26 @@ typedef struct {
Vp9LevelSpec level_spec;
} Vp9LevelInfo;
+typedef enum {
+ BITRATE_TOO_LARGE = 0,
+ LUMA_PIC_SIZE_TOO_LARGE = 1,
+ LUMA_SAMPLE_RATE_TOO_LARGE = 2,
+ CPB_TOO_LARGE = 3,
+ COMPRESSION_RATIO_TOO_SMALL = 4,
+ TOO_MANY_COLUMN_TILE = 5,
+ ALTREF_DIST_TOO_SMALL = 6,
+ TOO_MANY_REF_BUFFER = 7,
+ TARGET_LEVEL_FAIL_IDS = 8
+} TARGET_LEVEL_FAIL_ID;
+
+typedef struct {
+ int8_t level_index;
+ uint8_t rc_config_updated;
+ uint8_t fail_flag;
+ int max_frame_size; // in bits
+ double max_cpb_size; // in bits
+} LevelConstraint;
+
typedef struct VP9_COMP {
QUANTS quants;
ThreadData td;
@@ -594,6 +616,8 @@ typedef struct VP9_COMP {
int64_t vbp_thresholds[4];
int64_t vbp_threshold_minmax;
int64_t vbp_threshold_sad;
+ // Threshold used for partition copy
+ int64_t vbp_threshold_copy;
BLOCK_SIZE vbp_bsize_min;
// Multi-threading
@@ -605,6 +629,12 @@ typedef struct VP9_COMP {
int keep_level_stats;
Vp9LevelInfo level_info;
+
+ // Previous Partition Info
+ BLOCK_SIZE *prev_partition;
+ int8_t *prev_segment_id;
+
+ LevelConstraint level_constraint;
} VP9_COMP;
void vp9_initialize_enc(void);
@@ -760,6 +790,14 @@ static INLINE int *cond_cost_list(const struct VP9_COMP *cpi, int *cost_list) {
return cpi->sf.mv.subpel_search_method != SUBPEL_TREE ? cost_list : NULL;
}
+static INLINE int get_level_index(VP9_LEVEL level) {
+ int i;
+ for (i = 0; i < VP9_LEVELS; ++i) {
+ if (level == vp9_level_defs[i].level) return i;
+ }
+ return -1;
+}
+
VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec);
void vp9_new_framerate(VP9_COMP *cpi, double framerate);
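
get_level_index() above does a linear scan of vp9_level_defs to map a VP9_LEVEL value to its table row, and LevelConstraint then records that index along with the running frame-size and CPB caps. A hedged sketch of the lookup against a cut-down table (the struct and level encodings here are simplified placeholders; the CPB values are the kilobit figures from vp9_level_defs above):

    #include <stdio.h>

    typedef struct {
      int level;            // enum value, e.g. 10 for LEVEL_1 (assumed)
      double max_cpb_size;  // kilobits, as in vp9_level_defs
    } LevelSpecSketch;

    static const LevelSpecSketch level_defs[3] = {
      { 10, 400 }, { 11, 1000 }, { 20, 1500 }
    };

    // Linear scan, mirroring get_level_index(); -1 means "no constraint".
    static int get_index(int level) {
      int i;
      for (i = 0; i < 3; ++i)
        if (level == level_defs[i].level) return i;
      return -1;
    }

    int main(void) {
      printf("%d\n", get_index(11));  // 1
      return 0;
    }
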
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.c
index 788952d3467..72e9ac77e78 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.c
@@ -117,8 +117,7 @@ static void output_stats(FIRSTPASS_STATS *stats,
stats->intra_skip_pct, stats->intra_smooth_pct,
stats->inactive_zone_rows, stats->inactive_zone_cols, stats->MVr,
stats->mvr_abs, stats->MVc, stats->mvc_abs, stats->MVrv,
- stats->MVcv, stats->mv_in_out_count, stats->new_mv_count,
- stats->count, stats->duration);
+ stats->MVcv, stats->mv_in_out_count, stats->count, stats->duration);
fclose(fpfile);
}
#endif
@@ -157,7 +156,6 @@ static void zero_stats(FIRSTPASS_STATS *section) {
section->MVrv = 0.0;
section->MVcv = 0.0;
section->mv_in_out_count = 0.0;
- section->new_mv_count = 0.0;
section->count = 0.0;
section->duration = 1.0;
section->spatial_layer_id = 0;
@@ -187,7 +185,6 @@ static void accumulate_stats(FIRSTPASS_STATS *section,
section->MVrv += frame->MVrv;
section->MVcv += frame->MVcv;
section->mv_in_out_count += frame->mv_in_out_count;
- section->new_mv_count += frame->new_mv_count;
section->count += frame->count;
section->duration += frame->duration;
}
@@ -215,7 +212,6 @@ static void subtract_stats(FIRSTPASS_STATS *section,
section->MVrv -= frame->MVrv;
section->MVcv -= frame->MVcv;
section->mv_in_out_count -= frame->mv_in_out_count;
- section->new_mv_count -= frame->new_mv_count;
section->count -= frame->count;
section->duration -= frame->duration;
}
@@ -679,9 +675,7 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
int intra_skip_count = 0;
int intra_smooth_count = 0;
int image_data_start_row = INVALID_ROW;
- int new_mv_count = 0;
int sum_in_vectors = 0;
- MV lastmv = { 0, 0 };
TWO_PASS *twopass = &cpi->twopass;
const MV zero_mv = { 0, 0 };
int recon_y_stride, recon_uv_stride, uv_mb_height;
@@ -1144,10 +1138,6 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
}
#endif
- // Non-zero vector, was it different from the last non zero vector?
- if (!is_equal_mv(&mv, &lastmv)) ++new_mv_count;
- lastmv = mv;
-
// Does the row vector point inwards or outwards?
if (mb_row < cm->mb_rows / 2) {
if (mv.row > 0)
@@ -1263,7 +1253,6 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
fps.MVcv =
((double)sum_mvcs - ((double)sum_mvc * sum_mvc / mvcount)) / mvcount;
fps.mv_in_out_count = (double)sum_in_vectors / (mvcount * 2);
- fps.new_mv_count = new_mv_count;
fps.pcnt_motion = (double)mvcount / num_mbs;
} else {
fps.MVr = 0.0;
@@ -1273,7 +1262,6 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
fps.MVrv = 0.0;
fps.MVcv = 0.0;
fps.mv_in_out_count = 0.0;
- fps.new_mv_count = 0.0;
fps.pcnt_motion = 0.0;
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.h
index 6aa39cdc004..5541893dc89 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.h
@@ -61,7 +61,6 @@ typedef struct {
double MVrv;
double MVcv;
double mv_in_out_count;
- double new_mv_count;
double duration;
double count;
int64_t spatial_layer_id;
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mcomp.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mcomp.c
index 2d9bcbda679..70deda84211 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mcomp.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mcomp.c
@@ -277,7 +277,7 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) {
const uint8_t *const z = x->plane[0].src.buf; \
const int src_stride = x->plane[0].src.stride; \
const MACROBLOCKD *xd = &x->e_mbd; \
- unsigned int besterr = INT_MAX; \
+ unsigned int besterr = UINT_MAX; \
unsigned int sse; \
unsigned int whichdir; \
int thismse; \
@@ -472,7 +472,7 @@ uint32_t vp9_find_best_sub_pixel_tree_pruned_evenmore(
if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) ||
(abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3)))
- return INT_MAX;
+ return UINT_MAX;
return besterr;
}
@@ -622,7 +622,7 @@ uint32_t vp9_find_best_sub_pixel_tree_pruned(
if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) ||
(abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3)))
- return INT_MAX;
+ return UINT_MAX;
return besterr;
}
@@ -646,7 +646,7 @@ uint32_t vp9_find_best_sub_pixel_tree(
const uint8_t *const src_address = z;
const int src_stride = x->plane[0].src.stride;
const MACROBLOCKD *xd = &x->e_mbd;
- unsigned int besterr = INT_MAX;
+ unsigned int besterr = UINT_MAX;
unsigned int sse;
int thismse;
const int y_stride = xd->plane[0].pre[0].stride;
@@ -708,7 +708,7 @@ uint32_t vp9_find_best_sub_pixel_tree(
*sse1 = sse;
}
} else {
- cost_array[idx] = INT_MAX;
+ cost_array[idx] = UINT_MAX;
}
}
@@ -737,7 +737,7 @@ uint32_t vp9_find_best_sub_pixel_tree(
*sse1 = sse;
}
} else {
- cost_array[idx] = INT_MAX;
+ cost_array[idx] = UINT_MAX;
}
if (best_idx < 4 && best_idx >= 0) {
@@ -771,7 +771,7 @@ uint32_t vp9_find_best_sub_pixel_tree(
if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) ||
(abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3)))
- return INT_MAX;
+ return UINT_MAX;
return besterr;
}
@@ -2318,11 +2318,14 @@ int vp9_refining_search_8p_c(const MACROBLOCK *x, MV *ref_mv, int error_per_bit,
const struct buf_2d *const what = &x->plane[0].src;
const struct buf_2d *const in_what = &xd->plane[0].pre[0];
const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 };
- unsigned int best_sad =
+ unsigned int best_sad = INT_MAX;
+ int i, j;
+ clamp_mv(ref_mv, x->mv_limits.col_min, x->mv_limits.col_max,
+ x->mv_limits.row_min, x->mv_limits.row_max);
+ best_sad =
fn_ptr->sdaf(what->buf, what->stride, get_buf_from_mv(in_what, ref_mv),
in_what->stride, second_pred) +
mvsad_err_cost(x, ref_mv, &fcenter_mv, error_per_bit);
- int i, j;
for (i = 0; i < search_range; ++i) {
int best_site = -1;
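
The motion-search changes above use UINT_MAX rather than INT_MAX as the sentinel for the unsigned cost variables (besterr, cost_array) and for the out-of-range return value, so a legitimate but very large cost can still win against the initial value. A tiny demonstration of the difference (the cost value is made up):

    #include <limits.h>
    #include <stdio.h>

    int main(void) {
      const unsigned int cost = 3000000000u;       // legitimate, very poor cost
      const unsigned int old_sentinel = INT_MAX;   // 2147483647
      const unsigned int new_sentinel = UINT_MAX;  // 4294967295
      printf("beats old sentinel: %d\n", cost < old_sentinel);  // 0: lost
      printf("beats new sentinel: %d\n", cost < new_sentinel);  // 1: kept
      return 0;
    }
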
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_noise_estimate.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_noise_estimate.c
index 0e5d8ade4ae..2252fe16b9d 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_noise_estimate.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_noise_estimate.c
@@ -26,21 +26,23 @@ void vp9_noise_estimate_init(NOISE_ESTIMATE *const ne, int width, int height) {
ne->level = kLowLow;
ne->value = 0;
ne->count = 0;
- ne->thresh = 90;
+ ne->thresh = 100;
ne->last_w = 0;
ne->last_h = 0;
if (width * height >= 1920 * 1080) {
ne->thresh = 200;
} else if (width * height >= 1280 * 720) {
- ne->thresh = 130;
+ ne->thresh = 140;
}
ne->num_frames_estimate = 20;
}
static int enable_noise_estimation(VP9_COMP *const cpi) {
-// Enable noise estimation if denoising is on.
+// Enable noise estimation if denoising is on, but not for low resolutions.
#if CONFIG_VP9_TEMPORAL_DENOISING
- if (cpi->oxcf.noise_sensitivity > 0) return 1;
+ if (cpi->oxcf.noise_sensitivity > 0 && cpi->common.width >= 640 &&
+ cpi->common.height >= 360)
+ return 1;
#endif
// Only allow noise estimate under certain encoding mode.
// Enabled for 1 pass CBR, speed >=5, and if resolution is same as original.
@@ -50,7 +52,7 @@ static int enable_noise_estimation(VP9_COMP *const cpi) {
cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cpi->oxcf.speed >= 5 &&
cpi->resize_state == ORIG && cpi->resize_pending == 0 && !cpi->use_svc &&
cpi->oxcf.content != VP9E_CONTENT_SCREEN && cpi->common.width >= 640 &&
- cpi->common.height >= 480)
+ cpi->common.height >= 360)
return 1;
else
return 0;
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_pickmode.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_pickmode.c
index 2b7ddbcd948..33f3f5a476c 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_pickmode.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_pickmode.c
@@ -1259,16 +1259,17 @@ static void recheck_zeromv_after_denoising(
[INTER_OFFSET(ZEROMV)];
this_rdc.dist = dist;
this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, rate, dist);
- // Switch to ZEROMV if the rdcost for ZEROMV on denoised source
- // is lower than best_ref mode (on original source).
+ // Don't switch to ZEROMV if the rdcost for ZEROMV on denoised source
+ // is higher than best_ref mode (on original source).
if (this_rdc.rdcost > best_rdc->rdcost) {
this_rdc = *best_rdc;
mi->mode = ctx_den->best_mode;
mi->ref_frame[0] = ctx_den->best_ref_frame;
mi->interp_filter = ctx_den->best_pred_filter;
- if (ctx_den->best_ref_frame == INTRA_FRAME)
+ if (ctx_den->best_ref_frame == INTRA_FRAME) {
mi->mv[0].as_int = INVALID_MV;
- else if (ctx_den->best_ref_frame == GOLDEN_FRAME) {
+ mi->interp_filter = SWITCHABLE_FILTERS;
+ } else if (ctx_den->best_ref_frame == GOLDEN_FRAME) {
mi->mv[0].as_int =
ctx_den->frame_mv[ctx_den->best_mode][ctx_den->best_ref_frame]
.as_int;
@@ -1395,6 +1396,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
int perform_intra_pred = 1;
int use_golden_nonzeromv = 1;
int force_skip_low_temp_var = 0;
+ int skip_ref_find_pred[4] = { 0 };
#if CONFIG_VP9_TEMPORAL_DENOISING
VP9_PICKMODE_CTX_DEN ctx_den;
int64_t zero_last_cost_orig = INT64_MAX;
@@ -1469,9 +1471,15 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
usable_ref_frame = GOLDEN_FRAME;
}
- if (cpi->oxcf.lag_in_frames > 0 && cpi->oxcf.rc_mode == VPX_VBR &&
- (cpi->rc.alt_ref_gf_group || cpi->rc.is_src_frame_alt_ref))
- usable_ref_frame = ALTREF_FRAME;
+ if (cpi->oxcf.lag_in_frames > 0 && cpi->oxcf.rc_mode == VPX_VBR) {
+ if (cpi->rc.alt_ref_gf_group || cpi->rc.is_src_frame_alt_ref)
+ usable_ref_frame = ALTREF_FRAME;
+
+ if (cpi->rc.is_src_frame_alt_ref) {
+ skip_ref_find_pred[LAST_FRAME] = 1;
+ skip_ref_find_pred[GOLDEN_FRAME] = 1;
+ }
+ }
// For svc mode, on spatial_layer_id > 0: if the reference has different scale
// constrain the inter mode to only test zero motion.
@@ -1490,6 +1498,13 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
if (cpi->sf.short_circuit_low_temp_var) {
force_skip_low_temp_var =
get_force_skip_low_temp_var(&x->variance_low[0], mi_row, mi_col, bsize);
+ // If force_skip_low_temp_var is set, and for short circuit mode = 1 and 3,
+ // skip golden reference.
+ if ((cpi->sf.short_circuit_low_temp_var == 1 ||
+ cpi->sf.short_circuit_low_temp_var == 3) &&
+ force_skip_low_temp_var) {
+ usable_ref_frame = LAST_FRAME;
+ }
}
if (!((cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) &&
@@ -1497,9 +1512,11 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
use_golden_nonzeromv = 0;
for (ref_frame = LAST_FRAME; ref_frame <= usable_ref_frame; ++ref_frame) {
- find_predictors(cpi, x, ref_frame, frame_mv, const_motion,
- &ref_frame_skip_mask, flag_list, tile_data, mi_row, mi_col,
- yv12_mb, bsize, force_skip_low_temp_var);
+ if (!skip_ref_find_pred[ref_frame]) {
+ find_predictors(cpi, x, ref_frame, frame_mv, const_motion,
+ &ref_frame_skip_mask, flag_list, tile_data, mi_row,
+ mi_col, yv12_mb, bsize, force_skip_low_temp_var);
+ }
}
for (idx = 0; idx < RT_INTER_MODES; ++idx) {
@@ -1519,6 +1536,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
ref_frame = ref_mode_set_svc[idx].ref_frame;
}
if (ref_frame > usable_ref_frame) continue;
+ if (skip_ref_find_pred[ref_frame]) continue;
if (sf->short_circuit_flat_blocks && x->source_variance == 0 &&
this_mode != NEARESTMV) {
@@ -1558,7 +1576,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
continue;
}
- if (cpi->sf.short_circuit_low_temp_var == 2 && force_skip_low_temp_var &&
+ if (cpi->sf.short_circuit_low_temp_var >= 2 && force_skip_low_temp_var &&
ref_frame == LAST_FRAME && this_mode == NEWMV) {
continue;
}
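
vp9_pick_inter_mode() above gains a skip_ref_find_pred[] array: when the current source frame is the alt-ref overlay in one-pass VBR, predictor setup and mode evaluation for LAST and GOLDEN are skipped outright, and short_circuit_low_temp_var modes 1 and 3 now also restrict the usable references to LAST. A minimal sketch of the new gating (the ref-frame constants follow the usual libvpx ordering, assumed here; the condition values are made up):

    #include <stdio.h>

    enum { INTRA_FRAME, LAST_FRAME, GOLDEN_FRAME, ALTREF_FRAME };

    int main(void) {
      int skip_ref_find_pred[4] = { 0 };
      const int lag_in_frames = 8, rc_mode_is_vbr = 1, is_src_frame_alt_ref = 1;
      int ref;
      if (lag_in_frames > 0 && rc_mode_is_vbr && is_src_frame_alt_ref) {
        skip_ref_find_pred[LAST_FRAME] = 1;    // overlay frame: the alt-ref
        skip_ref_find_pred[GOLDEN_FRAME] = 1;  // prediction is enough
      }
      for (ref = LAST_FRAME; ref <= ALTREF_FRAME; ++ref)
        if (!skip_ref_find_pred[ref])
          printf("find_predictors for ref %d\n", ref);
      return 0;
    }
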
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.c
index b5cfd5de6c6..02059a70544 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.c
@@ -45,7 +45,7 @@
#define FRAME_OVERHEAD_BITS 200
-// Use this macro to turn on/off use of alt-refs in one-pass mode.
+// Use this macro to turn on/off use of alt-refs in one-pass vbr mode.
#define USE_ALTREF_FOR_ONE_PASS 0
#if CONFIG_VP9_HIGHBITDEPTH
@@ -414,7 +414,7 @@ static double get_rate_correction_factor(const VP9_COMP *cpi) {
} else {
if ((cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) &&
!rc->is_src_frame_alt_ref && !cpi->use_svc &&
- (cpi->oxcf.rc_mode != VPX_CBR || cpi->oxcf.gf_cbr_boost_pct > 20))
+ (cpi->oxcf.rc_mode != VPX_CBR || cpi->oxcf.gf_cbr_boost_pct > 100))
rcf = rc->rate_correction_factors[GF_ARF_STD];
else
rcf = rc->rate_correction_factors[INTER_NORMAL];
@@ -440,7 +440,7 @@ static void set_rate_correction_factor(VP9_COMP *cpi, double factor) {
} else {
if ((cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) &&
!rc->is_src_frame_alt_ref && !cpi->use_svc &&
- (cpi->oxcf.rc_mode != VPX_CBR || cpi->oxcf.gf_cbr_boost_pct > 20))
+ (cpi->oxcf.rc_mode != VPX_CBR || cpi->oxcf.gf_cbr_boost_pct > 100))
rc->rate_correction_factors[GF_ARF_STD] = factor;
else
rc->rate_correction_factors[INTER_NORMAL] = factor;
@@ -560,15 +560,17 @@ int vp9_rc_regulate_q(const VP9_COMP *cpi, int target_bits_per_frame,
// In CBR mode, this makes sure q is between oscillating Qs to prevent
// resonance.
if (cpi->oxcf.rc_mode == VPX_CBR &&
+ (!cpi->oxcf.gf_cbr_boost_pct ||
+ !(cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame)) &&
(cpi->rc.rc_1_frame * cpi->rc.rc_2_frame == -1) &&
cpi->rc.q_1_frame != cpi->rc.q_2_frame) {
q = clamp(q, VPXMIN(cpi->rc.q_1_frame, cpi->rc.q_2_frame),
VPXMAX(cpi->rc.q_1_frame, cpi->rc.q_2_frame));
}
#if USE_ALTREF_FOR_ONE_PASS
- if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_VBR &&
- cpi->oxcf.lag_in_frames > 0 && cpi->rc.is_src_frame_alt_ref &&
- !cpi->rc.alt_ref_gf_group) {
+ if (cpi->oxcf.enable_auto_arf && cpi->oxcf.pass == 0 &&
+ cpi->oxcf.rc_mode == VPX_VBR && cpi->oxcf.lag_in_frames > 0 &&
+ cpi->rc.is_src_frame_alt_ref && !cpi->rc.alt_ref_gf_group) {
q = VPXMIN(q, (q + cpi->rc.last_boosted_qindex) >> 1);
}
#endif
@@ -1528,8 +1530,14 @@ void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) {
adjust_gfint_frame_constraint(cpi, rc->frames_to_key);
rc->frames_till_gf_update_due = rc->baseline_gf_interval;
cpi->refresh_golden_frame = 1;
- rc->source_alt_ref_pending = USE_ALTREF_FOR_ONE_PASS;
- rc->alt_ref_gf_group = USE_ALTREF_FOR_ONE_PASS;
+ rc->source_alt_ref_pending = 0;
+ rc->alt_ref_gf_group = 0;
+#if USE_ALTREF_FOR_ONE_PASS
+ if (cpi->oxcf.enable_auto_arf) {
+ rc->source_alt_ref_pending = 1;
+ rc->alt_ref_gf_group = 1;
+ }
+#endif
}
if (cm->frame_type == KEY_FRAME)
target = calc_iframe_target_size_one_pass_vbr(cpi);
@@ -2140,20 +2148,22 @@ void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi, uint64_t avg_sad_current) {
rc->gfu_boost = DEFAULT_GF_BOOST >> 2;
}
#if USE_ALTREF_FOR_ONE_PASS
- // Don't use alt-ref if there is a scene cut within the group,
- // or content is not low.
- if ((rc->high_source_sad_lagindex > 0 &&
- rc->high_source_sad_lagindex <= rc->frames_till_gf_update_due) ||
- (avg_source_sad_lag > 3 * sad_thresh1 >> 3)) {
- rc->source_alt_ref_pending = 0;
- rc->alt_ref_gf_group = 0;
- } else {
- rc->source_alt_ref_pending = 1;
- rc->alt_ref_gf_group = 1;
- // If alt-ref is used for this gf group, limit the interval.
- if (rc->baseline_gf_interval > 10 &&
- rc->baseline_gf_interval < rc->frames_to_key)
- rc->baseline_gf_interval = 10;
+ if (cpi->oxcf.enable_auto_arf) {
+ // Don't use alt-ref if there is a scene cut within the group,
+ // or content is not low.
+ if ((rc->high_source_sad_lagindex > 0 &&
+ rc->high_source_sad_lagindex <= rc->frames_till_gf_update_due) ||
+ (avg_source_sad_lag > 3 * sad_thresh1 >> 3)) {
+ rc->source_alt_ref_pending = 0;
+ rc->alt_ref_gf_group = 0;
+ } else {
+ rc->source_alt_ref_pending = 1;
+ rc->alt_ref_gf_group = 1;
+ // If alt-ref is used for this gf group, limit the interval.
+ if (rc->baseline_gf_interval > 10 &&
+ rc->baseline_gf_interval < rc->frames_to_key)
+ rc->baseline_gf_interval = 10;
+ }
}
#endif
target = calc_pframe_target_size_one_pass_vbr(cpi);
@@ -2243,10 +2253,12 @@ void vp9_avg_source_sad(VP9_COMP *cpi) {
for (sbi_row = 0; sbi_row < sb_rows; ++sbi_row) {
for (sbi_col = 0; sbi_col < sb_cols; ++sbi_col) {
// Checker-board pattern, ignore boundary.
- if ((sbi_row > 0 && sbi_col > 0) &&
- (sbi_row < sb_rows - 1 && sbi_col < sb_cols - 1) &&
- ((sbi_row % 2 == 0 && sbi_col % 2 == 0) ||
- (sbi_row % 2 != 0 && sbi_col % 2 != 0))) {
+ // If the partition copy is on, compute for every superblock.
+ if (cpi->sf.copy_partition_flag ||
+ ((sbi_row > 0 && sbi_col > 0) &&
+ (sbi_row < sb_rows - 1 && sbi_col < sb_cols - 1) &&
+ ((sbi_row % 2 == 0 && sbi_col % 2 == 0) ||
+ (sbi_row % 2 != 0 && sbi_col % 2 != 0)))) {
num_samples++;
avg_sad += cpi->fn_ptr[bsize].sdf(src_y, src_ystride, last_src_y,
last_src_ystride);
@@ -2284,7 +2296,10 @@ void vp9_avg_source_sad(VP9_COMP *cpi) {
cpi->ext_refresh_frame_flags_pending == 0) {
int target;
cpi->refresh_golden_frame = 1;
- rc->source_alt_ref_pending = USE_ALTREF_FOR_ONE_PASS;
+ rc->source_alt_ref_pending = 0;
+#if USE_ALTREF_FOR_ONE_PASS
+ if (cpi->oxcf.enable_auto_arf) rc->source_alt_ref_pending = 1;
+#endif
rc->gfu_boost = DEFAULT_GF_BOOST >> 1;
rc->baseline_gf_interval =
VPXMIN(20, VPXMAX(10, rc->baseline_gf_interval));
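
vp9_avg_source_sad() above normally samples superblocks in a checker-board pattern away from the frame boundary; with copy_partition on, every superblock is measured so a per-superblock SAD is available for the partition-copy gate. A small sketch of that sampling predicate on a toy superblock grid (sample_sb and the grid size are illustrative):

    #include <stdio.h>

    // Sample superblocks in a checker-board pattern away from the frame
    // boundary, unless copy_partition is on, in which case sample them all.
    static int sample_sb(int copy_partition_flag, int row, int col, int rows,
                         int cols) {
      return copy_partition_flag ||
             (row > 0 && col > 0 && row < rows - 1 && col < cols - 1 &&
              ((row % 2 == 0 && col % 2 == 0) ||
               (row % 2 != 0 && col % 2 != 0)));
    }

    int main(void) {
      int r, c, count = 0;
      for (r = 0; r < 6; ++r)
        for (c = 0; c < 8; ++c) count += sample_sb(0, r, c, 6, 8);
      printf("sampled %d of 48 superblocks\n", count);  // interior checker-board
      return 0;
    }
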
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.c
index 3e1ed50a6d2..81cb431ba58 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.c
@@ -182,6 +182,7 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm,
sf->mv.subpel_iters_per_step = 1;
sf->mode_skip_start = 10;
sf->adaptive_pred_interp_filter = 1;
+ sf->allow_acl = 0;
sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V;
@@ -309,6 +310,8 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, int speed,
sf->use_fast_coef_costing = 1;
sf->allow_exhaustive_searches = 0;
sf->exhaustive_searches_thresh = INT_MAX;
+ sf->allow_acl = 0;
+ sf->copy_partition_flag = 0;
if (speed >= 1) {
sf->allow_txfm_domain_distortion = 1;
@@ -494,6 +497,18 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, int speed,
if (speed >= 8) {
sf->adaptive_rd_thresh = 4;
+ // Disabled for now until the threshold is tuned.
+ sf->copy_partition_flag = 0;
+ if (sf->copy_partition_flag) {
+ if (cpi->prev_partition == NULL) {
+ cpi->prev_partition = (BLOCK_SIZE *)vpx_calloc(
+ cm->mi_stride * cm->mi_rows, sizeof(BLOCK_SIZE));
+ }
+ if (cpi->prev_segment_id == NULL) {
+ cpi->prev_segment_id =
+ (int8_t *)vpx_calloc(cm->mi_stride * cm->mi_rows, sizeof(int8_t));
+ }
+ }
sf->mv.subpel_force_stop = (content == VP9E_CONTENT_SCREEN) ? 3 : 2;
if (content == VP9E_CONTENT_SCREEN) sf->lpf_pick = LPF_PICK_MINIMAL_LPF;
// Only keep INTRA_DC mode for speed 8.
@@ -505,7 +520,7 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, int speed,
if (!cpi->use_svc && cpi->oxcf.rc_mode == VPX_CBR &&
content != VP9E_CONTENT_SCREEN) {
// More aggressive short circuit for speed 8.
- sf->short_circuit_low_temp_var = 2;
+ sf->short_circuit_low_temp_var = 3;
}
sf->limit_newmv_early_exit = 0;
}
@@ -592,6 +607,7 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) {
sf->tx_domain_thresh = 99.0;
sf->allow_quant_coeff_opt = sf->optimize_coefficients;
sf->quant_opt_thresh = 99.0;
+ sf->allow_acl = 1;
for (i = 0; i < TX_SIZES; i++) {
sf->intra_y_mode_mask[i] = INTRA_ALL;
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.h
index 6d0b9420a1d..944fe6322fb 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.h
@@ -244,6 +244,10 @@ typedef struct SPEED_FEATURES {
int allow_quant_coeff_opt;
double quant_opt_thresh;
+ // Enable asymptotic closed-loop encoding decision for key frame and
+ // alternate reference frames.
+ int allow_acl;
+
// Use transform domain distortion. Use pixel domain distortion in speed 0
// and certain situations in higher speed to improve the RD model precision.
int allow_txfm_domain_distortion;
@@ -452,11 +456,13 @@ typedef struct SPEED_FEATURES {
int short_circuit_flat_blocks;
// Skip a number of expensive mode evaluations for blocks with very low
- // temporal variance.
- // 1: Skip golden non-zeromv and ALL INTRA for bsize >= 32x32.
+ // temporal variance. If the low temporal variance flag is set for a block,
+ // do the following:
+ // 1: Skip all golden modes and ALL INTRA for bsize >= 32x32.
// 2: Skip golden non-zeromv and newmv-last for bsize >= 16x16, skip ALL
// INTRA for bsize >= 32x32 and vert/horz INTRA for bsize 16x16, 16x32 and
// 32x16.
+ // 3: Same as (2), but also skip golden zeromv.
int short_circuit_low_temp_var;
// Limits the rd-threshold update for early exit for the newmv-last mode,
@@ -469,6 +475,9 @@ typedef struct SPEED_FEATURES {
// Bias to use base mv and skip 1/4 subpel search when use base mv in
// enhancement layer.
int base_mv_aggressive;
+
+ // Global flag to enable partition copy from the previous frame.
+ int copy_partition_flag;
} SPEED_FEATURES;
struct VP9_COMP;
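To make the three documented short_circuit_low_temp_var levels easier to compare, here is a hedged sketch that decodes them into per-block skip decisions; the struct, field and function names are hypothetical, and the real checks in the encoder are organised per block size rather than in one helper:

typedef struct {
  int skip_golden_nonzeromv;
  int skip_golden_zeromv;
  int skip_newmv_last;
  int skip_all_intra;
  int skip_vert_horz_intra;
} low_var_skips;

static low_var_skips decode_low_temp_var(int level, int bsize_ge_32x32,
                                         int bsize_ge_16x16,
                                         int bsize_16x16_16x32_32x16) {
  low_var_skips s = { 0, 0, 0, 0, 0 };
  if (level == 1 && bsize_ge_32x32) {
    s.skip_golden_nonzeromv = s.skip_golden_zeromv = 1; /* all golden modes */
    s.skip_all_intra = 1;
  } else if (level >= 2) {
    if (bsize_ge_16x16) {
      s.skip_golden_nonzeromv = 1;
      s.skip_newmv_last = 1;
    }
    if (bsize_ge_32x32) s.skip_all_intra = 1;
    if (bsize_16x16_16x32_32x16) s.skip_vert_horz_intra = 1;
    if (level >= 3) s.skip_golden_zeromv = 1; /* "same as (2)" plus this */
  }
  return s;
}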
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_svc_layercontext.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_svc_layercontext.c
index 2d29e268b1f..1d892dc148b 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_svc_layercontext.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_svc_layercontext.c
@@ -650,6 +650,21 @@ int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) {
lc->scaling_factor_num, lc->scaling_factor_den, &width,
&height);
+ // The usage of use_base_mv assumes down-scale of 2x2. For now, turn off use
+ // of base motion vectors if spatial scale factors for any layers are not 2.
+ // TODO(marpan): Fix this to allow for use_base_mv for scale factors != 2.
+ if (cpi->svc.number_spatial_layers > 1) {
+ int sl;
+ for (sl = 0; sl < cpi->svc.number_spatial_layers - 1; ++sl) {
+ lc = &cpi->svc.layer_context[sl * cpi->svc.number_temporal_layers +
+ cpi->svc.temporal_layer_id];
+ if (lc->scaling_factor_num != lc->scaling_factor_den >> 1) {
+ cpi->svc.use_base_mv = 0;
+ break;
+ }
+ }
+ }
+
if (vp9_set_size_literal(cpi, width, height) != 0)
return VPX_CODEC_INVALID_PARAM;
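The per-layer loop above disables use_base_mv unless every spatial layer is scaled by exactly 1/2. A minimal sketch of that check, with hypothetical names, not part of the patch:

/* A layer downscaled by exactly 2 has num == den / 2, e.g. 1/2 or 2/4. */
static int layer_is_half_scale(int scaling_factor_num,
                               int scaling_factor_den) {
  return scaling_factor_num == (scaling_factor_den >> 1);
}

For example, a 2/3 scale factor fails this test and turns off base-MV reuse for the whole SVC stream.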
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_temporal_filter.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_temporal_filter.c
index a167eeb15de..344658483a1 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_temporal_filter.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_temporal_filter.c
@@ -208,17 +208,17 @@ void vp9_highbd_temporal_filter_apply_c(
}
#endif // CONFIG_VP9_HIGHBITDEPTH
-static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi,
- uint8_t *arf_frame_buf,
- uint8_t *frame_ptr_buf,
- int stride) {
+static uint32_t temporal_filter_find_matching_mb_c(VP9_COMP *cpi,
+ uint8_t *arf_frame_buf,
+ uint8_t *frame_ptr_buf,
+ int stride) {
MACROBLOCK *const x = &cpi->td.mb;
MACROBLOCKD *const xd = &x->e_mbd;
MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv;
const SEARCH_METHODS old_search_method = mv_sf->search_method;
int step_param;
int sadpb = x->sadperbit16;
- int bestsme = INT_MAX;
+ uint32_t bestsme = UINT_MAX;
uint32_t distortion;
uint32_t sse;
int cost_list[5];
@@ -334,8 +334,8 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi,
((mb_cols - 1 - mb_col) * 16) + (17 - 2 * VP9_INTERP_EXTEND);
for (frame = 0; frame < frame_count; frame++) {
- const int thresh_low = 10000;
- const int thresh_high = 20000;
+ const uint32_t thresh_low = 10000;
+ const uint32_t thresh_high = 20000;
if (frames[frame] == NULL) continue;
@@ -346,7 +346,7 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi,
filter_weight = 2;
} else {
// Find best match in this frame by MC
- int err = temporal_filter_find_matching_mb_c(
+ uint32_t err = temporal_filter_find_matching_mb_c(
cpi, frames[alt_ref_index]->y_buffer + mb_y_offset,
frames[frame]->y_buffer + mb_y_offset, frames[frame]->y_stride);
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/vp9_cx_iface.c b/chromium/third_party/libvpx/source/libvpx/vp9/vp9_cx_iface.c
index a797b2c2624..e6cea080d16 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/vp9_cx_iface.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/vp9_cx_iface.c
@@ -157,6 +157,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
RANGE_CHECK_HI(cfg, rc_max_quantizer, 63);
RANGE_CHECK_HI(cfg, rc_min_quantizer, cfg->rc_max_quantizer);
RANGE_CHECK_BOOL(extra_cfg, lossless);
+ RANGE_CHECK_BOOL(extra_cfg, frame_parallel_decoding_mode);
RANGE_CHECK(extra_cfg, aq_mode, 0, AQ_MODE_COUNT - 2);
RANGE_CHECK(extra_cfg, alt_ref_aq, 0, 1);
RANGE_CHECK(extra_cfg, frame_periodic_boost, 0, 1);
@@ -389,6 +390,50 @@ static int get_image_bps(const vpx_image_t *img) {
return 0;
}
+// Modify the encoder config for the target level.
+static void config_target_level(VP9EncoderConfig *oxcf) {
+ double max_average_bitrate; // in bits per second
+ int max_over_shoot_pct;
+ const int target_level_index = get_level_index(oxcf->target_level);
+
+ vpx_clear_system_state();
+ assert(target_level_index >= 0);
+ assert(target_level_index < VP9_LEVELS);
+
+ // Maximum target bit-rate is level_limit * 80%.
+ max_average_bitrate =
+ vp9_level_defs[target_level_index].average_bitrate * 800.0;
+ if ((double)oxcf->target_bandwidth > max_average_bitrate)
+ oxcf->target_bandwidth = (int64_t)(max_average_bitrate);
+ if (oxcf->ss_number_layers == 1 && oxcf->pass != 0)
+ oxcf->ss_target_bitrate[0] = (int)oxcf->target_bandwidth;
+
+ // Adjust max over-shoot percentage.
+ max_over_shoot_pct =
+ (int)((max_average_bitrate * 1.10 - (double)oxcf->target_bandwidth) *
+ 100 / (double)(oxcf->target_bandwidth));
+ if (oxcf->over_shoot_pct > max_over_shoot_pct)
+ oxcf->over_shoot_pct = max_over_shoot_pct;
+
+ // Adjust worst allowed quantizer.
+ oxcf->worst_allowed_q = vp9_quantizer_to_qindex(63);
+
+  // Adjust minimum alt-ref distance.
+ if (oxcf->min_gf_interval <
+ (int)vp9_level_defs[target_level_index].min_altref_distance)
+ oxcf->min_gf_interval =
+ (int)vp9_level_defs[target_level_index].min_altref_distance;
+
+ // Adjust maximum column tiles.
+ if (vp9_level_defs[target_level_index].max_col_tiles <
+ (1 << oxcf->tile_columns)) {
+ while (oxcf->tile_columns > 0 &&
+ vp9_level_defs[target_level_index].max_col_tiles <
+ (1 << oxcf->tile_columns))
+ --oxcf->tile_columns;
+ }
+}
+
static vpx_codec_err_t set_encoder_config(
VP9EncoderConfig *oxcf, const vpx_codec_enc_cfg_t *cfg,
const struct vp9_extracfg *extra_cfg) {
@@ -532,6 +577,8 @@ static vpx_codec_err_t set_encoder_config(
} else if (oxcf->ts_number_layers == 1) {
oxcf->ts_rate_decimator[0] = 1;
}
+
+ if (get_level_index(oxcf->target_level) >= 0) config_target_level(oxcf);
/*
printf("Current VP9 Settings: \n");
printf("target_bandwidth: %d\n", oxcf->target_bandwidth);
@@ -1002,6 +1049,28 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
if (cpi == NULL) return VPX_CODEC_INVALID_PARAM;
+ if (cpi->oxcf.pass == 2 && cpi->level_constraint.level_index >= 0 &&
+ !cpi->level_constraint.rc_config_updated) {
+ SVC *const svc = &cpi->svc;
+ const int is_two_pass_svc =
+ (svc->number_spatial_layers > 1) || (svc->number_temporal_layers > 1);
+ const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+ TWO_PASS *const twopass = &cpi->twopass;
+ FIRSTPASS_STATS *stats = &twopass->total_stats;
+ if (is_two_pass_svc) {
+ const double frame_rate = 10000000.0 * stats->count / stats->duration;
+ vp9_update_spatial_layer_framerate(cpi, frame_rate);
+ twopass->bits_left =
+ (int64_t)(stats->duration *
+ svc->layer_context[svc->spatial_layer_id].target_bandwidth /
+ 10000000.0);
+ } else {
+ twopass->bits_left =
+ (int64_t)(stats->duration * oxcf->target_bandwidth / 10000000.0);
+ }
+ cpi->level_constraint.rc_config_updated = 1;
+ }
+
if (img != NULL) {
res = validate_img(ctx, img);
if (res == VPX_CODEC_OK) {
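For the config_target_level() logic added above, target_bandwidth is handled in bits per second while the level table's average_bitrate appears to be in kbit/s, hence the factor of 800.0 (1000 * 0.8). A hedged scalar sketch of just the bit-rate cap, with hypothetical names, not part of the patch:

#include <stdint.h>

/* Clamp the configured target bandwidth (bit/s) to 80% of the level's
 * average-bitrate limit, assumed here to be given in kbit/s. */
static int64_t cap_target_bandwidth(int64_t target_bps,
                                    double level_avg_bitrate_kbps) {
  const double max_bps = level_avg_bitrate_kbps * 800.0; /* kbit/s * 1000 * 0.8 */
  return ((double)target_bps > max_bps) ? (int64_t)max_bps : target_bps;
}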
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx/src/svc_encodeframe.c b/chromium/third_party/libvpx/source/libvpx/vpx/src/svc_encodeframe.c
index 88b1531d8c4..c2f80d88515 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx/src/svc_encodeframe.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx/src/svc_encodeframe.c
@@ -201,7 +201,7 @@ static vpx_codec_err_t parse_options(SvcContext *svc_ctx, const char *options) {
char *input_string;
char *option_name;
char *option_value;
- char *input_ptr;
+ char *input_ptr = NULL;
SvcInternal_t *const si = get_svc_internal(svc_ctx);
vpx_codec_err_t res = VPX_CODEC_OK;
int i, alt_ref_enabled = 0;
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/deblock_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/deblock_neon.c
new file mode 100644
index 00000000000..1fb41d29920
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/deblock_neon.c
@@ -0,0 +1,485 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+
+extern const int16_t vpx_rv[];
+
+static uint8x8_t average_k_out(const uint8x8_t a2, const uint8x8_t a1,
+ const uint8x8_t v0, const uint8x8_t b1,
+ const uint8x8_t b2) {
+ const uint8x8_t k1 = vrhadd_u8(a2, a1);
+ const uint8x8_t k2 = vrhadd_u8(b2, b1);
+ const uint8x8_t k3 = vrhadd_u8(k1, k2);
+ return vrhadd_u8(k3, v0);
+}
+
+static uint8x8_t generate_mask(const uint8x8_t a2, const uint8x8_t a1,
+ const uint8x8_t v0, const uint8x8_t b1,
+ const uint8x8_t b2, const uint8x8_t filter) {
+ const uint8x8_t a2_v0 = vabd_u8(a2, v0);
+ const uint8x8_t a1_v0 = vabd_u8(a1, v0);
+ const uint8x8_t b1_v0 = vabd_u8(b1, v0);
+ const uint8x8_t b2_v0 = vabd_u8(b2, v0);
+
+ uint8x8_t max = vmax_u8(a2_v0, a1_v0);
+ max = vmax_u8(b1_v0, max);
+ max = vmax_u8(b2_v0, max);
+ return vclt_u8(max, filter);
+}
+
+static uint8x8_t generate_output(const uint8x8_t a2, const uint8x8_t a1,
+ const uint8x8_t v0, const uint8x8_t b1,
+ const uint8x8_t b2, const uint8x8_t filter) {
+ const uint8x8_t k_out = average_k_out(a2, a1, v0, b1, b2);
+ const uint8x8_t mask = generate_mask(a2, a1, v0, b1, b2, filter);
+
+ return vbsl_u8(mask, k_out, v0);
+}
+
+// Same functions but for uint8x16_t.
+static uint8x16_t average_k_outq(const uint8x16_t a2, const uint8x16_t a1,
+ const uint8x16_t v0, const uint8x16_t b1,
+ const uint8x16_t b2) {
+ const uint8x16_t k1 = vrhaddq_u8(a2, a1);
+ const uint8x16_t k2 = vrhaddq_u8(b2, b1);
+ const uint8x16_t k3 = vrhaddq_u8(k1, k2);
+ return vrhaddq_u8(k3, v0);
+}
+
+static uint8x16_t generate_maskq(const uint8x16_t a2, const uint8x16_t a1,
+ const uint8x16_t v0, const uint8x16_t b1,
+ const uint8x16_t b2, const uint8x16_t filter) {
+ const uint8x16_t a2_v0 = vabdq_u8(a2, v0);
+ const uint8x16_t a1_v0 = vabdq_u8(a1, v0);
+ const uint8x16_t b1_v0 = vabdq_u8(b1, v0);
+ const uint8x16_t b2_v0 = vabdq_u8(b2, v0);
+
+ uint8x16_t max = vmaxq_u8(a2_v0, a1_v0);
+ max = vmaxq_u8(b1_v0, max);
+ max = vmaxq_u8(b2_v0, max);
+ return vcltq_u8(max, filter);
+}
+
+static uint8x16_t generate_outputq(const uint8x16_t a2, const uint8x16_t a1,
+ const uint8x16_t v0, const uint8x16_t b1,
+ const uint8x16_t b2,
+ const uint8x16_t filter) {
+ const uint8x16_t k_out = average_k_outq(a2, a1, v0, b1, b2);
+ const uint8x16_t mask = generate_maskq(a2, a1, v0, b1, b2, filter);
+
+ return vbslq_u8(mask, k_out, v0);
+}
+
+void vpx_post_proc_down_and_across_mb_row_neon(uint8_t *src_ptr,
+ uint8_t *dst_ptr, int src_stride,
+ int dst_stride, int cols,
+ uint8_t *f, int size) {
+ uint8_t *src, *dst;
+ int row;
+ int col;
+
+ // Process a stripe of macroblocks. The stripe will be a multiple of 16 (for
+ // Y) or 8 (for U/V) wide (cols) and the height (size) will be 16 (for Y) or 8
+ // (for U/V).
+ assert((size == 8 || size == 16) && cols % 8 == 0);
+
+ // While columns of length 16 can be processed, load them.
+ for (col = 0; col < cols - 8; col += 16) {
+ uint8x16_t a0, a1, a2, a3, a4, a5, a6, a7;
+ src = src_ptr - 2 * src_stride;
+ dst = dst_ptr;
+
+ a0 = vld1q_u8(src);
+ src += src_stride;
+ a1 = vld1q_u8(src);
+ src += src_stride;
+ a2 = vld1q_u8(src);
+ src += src_stride;
+ a3 = vld1q_u8(src);
+ src += src_stride;
+
+ for (row = 0; row < size; row += 4) {
+ uint8x16_t v_out_0, v_out_1, v_out_2, v_out_3;
+ const uint8x16_t filterq = vld1q_u8(f + col);
+
+ a4 = vld1q_u8(src);
+ src += src_stride;
+ a5 = vld1q_u8(src);
+ src += src_stride;
+ a6 = vld1q_u8(src);
+ src += src_stride;
+ a7 = vld1q_u8(src);
+ src += src_stride;
+
+ v_out_0 = generate_outputq(a0, a1, a2, a3, a4, filterq);
+ v_out_1 = generate_outputq(a1, a2, a3, a4, a5, filterq);
+ v_out_2 = generate_outputq(a2, a3, a4, a5, a6, filterq);
+ v_out_3 = generate_outputq(a3, a4, a5, a6, a7, filterq);
+
+ vst1q_u8(dst, v_out_0);
+ dst += dst_stride;
+ vst1q_u8(dst, v_out_1);
+ dst += dst_stride;
+ vst1q_u8(dst, v_out_2);
+ dst += dst_stride;
+ vst1q_u8(dst, v_out_3);
+ dst += dst_stride;
+
+ // Rotate over to the next slot.
+ a0 = a4;
+ a1 = a5;
+ a2 = a6;
+ a3 = a7;
+ }
+
+ src_ptr += 16;
+ dst_ptr += 16;
+ }
+
+ // Clean up any left over column of length 8.
+ if (col != cols) {
+ uint8x8_t a0, a1, a2, a3, a4, a5, a6, a7;
+ src = src_ptr - 2 * src_stride;
+ dst = dst_ptr;
+
+ a0 = vld1_u8(src);
+ src += src_stride;
+ a1 = vld1_u8(src);
+ src += src_stride;
+ a2 = vld1_u8(src);
+ src += src_stride;
+ a3 = vld1_u8(src);
+ src += src_stride;
+
+ for (row = 0; row < size; row += 4) {
+ uint8x8_t v_out_0, v_out_1, v_out_2, v_out_3;
+ const uint8x8_t filter = vld1_u8(f + col);
+
+ a4 = vld1_u8(src);
+ src += src_stride;
+ a5 = vld1_u8(src);
+ src += src_stride;
+ a6 = vld1_u8(src);
+ src += src_stride;
+ a7 = vld1_u8(src);
+ src += src_stride;
+
+ v_out_0 = generate_output(a0, a1, a2, a3, a4, filter);
+ v_out_1 = generate_output(a1, a2, a3, a4, a5, filter);
+ v_out_2 = generate_output(a2, a3, a4, a5, a6, filter);
+ v_out_3 = generate_output(a3, a4, a5, a6, a7, filter);
+
+ vst1_u8(dst, v_out_0);
+ dst += dst_stride;
+ vst1_u8(dst, v_out_1);
+ dst += dst_stride;
+ vst1_u8(dst, v_out_2);
+ dst += dst_stride;
+ vst1_u8(dst, v_out_3);
+ dst += dst_stride;
+
+ // Rotate over to the next slot.
+ a0 = a4;
+ a1 = a5;
+ a2 = a6;
+ a3 = a7;
+ }
+
+ // Not strictly necessary but makes resetting dst_ptr easier.
+ dst_ptr += 8;
+ }
+
+ dst_ptr -= cols;
+
+ for (row = 0; row < size; row += 8) {
+ uint8x8_t a0, a1, a2, a3;
+ uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7;
+
+ src = dst_ptr;
+ dst = dst_ptr;
+
+ // Load 8 values, transpose 4 of them, and discard 2 because they will be
+ // reloaded later.
+ load_and_transpose_u8_4x8(src, dst_stride, &a0, &a1, &a2, &a3);
+ a3 = a1;
+ a2 = a1 = a0; // Extend left border.
+
+ src += 2;
+
+ for (col = 0; col < cols; col += 8) {
+ uint8x8_t v_out_0, v_out_1, v_out_2, v_out_3, v_out_4, v_out_5, v_out_6,
+ v_out_7;
+ // Although the filter is meant to be applied vertically and is instead
+ // being applied horizontally here it's OK because it's set in blocks of 8
+ // (or 16).
+ const uint8x8_t filter = vld1_u8(f + col);
+
+ load_and_transpose_u8_8x8(src, dst_stride, &b0, &b1, &b2, &b3, &b4, &b5,
+ &b6, &b7);
+
+ if (col + 8 == cols) {
+ // Last row. Extend border (b5).
+ b6 = b7 = b5;
+ }
+
+ v_out_0 = generate_output(a0, a1, a2, a3, b0, filter);
+ v_out_1 = generate_output(a1, a2, a3, b0, b1, filter);
+ v_out_2 = generate_output(a2, a3, b0, b1, b2, filter);
+ v_out_3 = generate_output(a3, b0, b1, b2, b3, filter);
+ v_out_4 = generate_output(b0, b1, b2, b3, b4, filter);
+ v_out_5 = generate_output(b1, b2, b3, b4, b5, filter);
+ v_out_6 = generate_output(b2, b3, b4, b5, b6, filter);
+ v_out_7 = generate_output(b3, b4, b5, b6, b7, filter);
+
+ transpose_and_store_u8_8x8(dst, dst_stride, v_out_0, v_out_1, v_out_2,
+ v_out_3, v_out_4, v_out_5, v_out_6, v_out_7);
+
+ a0 = b4;
+ a1 = b5;
+ a2 = b6;
+ a3 = b7;
+
+ src += 8;
+ dst += 8;
+ }
+
+ dst_ptr += 8 * dst_stride;
+ }
+}
+
+// sum += x;
+// sumsq += x * y;
+static void accumulate_sum_sumsq(const int16x4_t x, const int32x4_t xy,
+ int16x4_t *const sum, int32x4_t *const sumsq) {
+ const int16x4_t zero = vdup_n_s16(0);
+ const int32x4_t zeroq = vdupq_n_s32(0);
+
+ // Add in the first set because vext doesn't work with '0'.
+ *sum = vadd_s16(*sum, x);
+ *sumsq = vaddq_s32(*sumsq, xy);
+
+ // Shift x and xy to the right and sum. vext requires an immediate.
+ *sum = vadd_s16(*sum, vext_s16(zero, x, 1));
+ *sumsq = vaddq_s32(*sumsq, vextq_s32(zeroq, xy, 1));
+
+ *sum = vadd_s16(*sum, vext_s16(zero, x, 2));
+ *sumsq = vaddq_s32(*sumsq, vextq_s32(zeroq, xy, 2));
+
+ *sum = vadd_s16(*sum, vext_s16(zero, x, 3));
+ *sumsq = vaddq_s32(*sumsq, vextq_s32(zeroq, xy, 3));
+}
+
+// Generate mask based on (sumsq * 15 - sum * sum < flimit)
+static uint16x4_t calculate_mask(const int16x4_t sum, const int32x4_t sumsq,
+ const int32x4_t f, const int32x4_t fifteen) {
+ const int32x4_t a = vmulq_s32(sumsq, fifteen);
+ const int32x4_t b = vmlsl_s16(a, sum, sum);
+ const uint32x4_t mask32 = vcltq_s32(b, f);
+ return vmovn_u32(mask32);
+}
+
+static uint8x8_t combine_mask(const int16x4_t sum_low, const int16x4_t sum_high,
+ const int32x4_t sumsq_low,
+ const int32x4_t sumsq_high, const int32x4_t f) {
+ const int32x4_t fifteen = vdupq_n_s32(15);
+ const uint16x4_t mask16_low = calculate_mask(sum_low, sumsq_low, f, fifteen);
+ const uint16x4_t mask16_high =
+ calculate_mask(sum_high, sumsq_high, f, fifteen);
+ return vmovn_u16(vcombine_u16(mask16_low, mask16_high));
+}
+
+// Apply filter of (8 + sum + s[c]) >> 4.
+static uint8x8_t filter_pixels(const int16x8_t sum, const uint8x8_t s) {
+ const int16x8_t s16 = vreinterpretq_s16_u16(vmovl_u8(s));
+ const int16x8_t sum_s = vaddq_s16(sum, s16);
+
+ return vqrshrun_n_s16(sum_s, 4);
+}
+
+void vpx_mbpost_proc_across_ip_neon(uint8_t *src, int pitch, int rows, int cols,
+ int flimit) {
+ int row, col;
+ const int32x4_t f = vdupq_n_s32(flimit);
+
+ assert(cols % 8 == 0);
+
+ for (row = 0; row < rows; ++row) {
+ // Sum the first 8 elements, which are extended from s[0].
+ // sumsq gets primed with +16.
+ int sumsq = src[0] * src[0] * 9 + 16;
+ int sum = src[0] * 9;
+
+ uint8x8_t left_context, s, right_context;
+ int16x4_t sum_low, sum_high;
+ int32x4_t sumsq_low, sumsq_high;
+
+ // Sum (+square) the next 6 elements.
+ // Skip [0] because it's included above.
+ for (col = 1; col <= 6; ++col) {
+ sumsq += src[col] * src[col];
+ sum += src[col];
+ }
+
+ // Prime the sums. Later the loop uses the _high values to prime the new
+ // vectors.
+ sumsq_high = vdupq_n_s32(sumsq);
+ sum_high = vdup_n_s16(sum);
+
+ // Manually extend the left border.
+ left_context = vdup_n_u8(src[0]);
+
+ for (col = 0; col < cols; col += 8) {
+ uint8x8_t mask, output;
+ int16x8_t x, y;
+ int32x4_t xy_low, xy_high;
+
+ s = vld1_u8(src + col);
+
+ if (col + 8 == cols) {
+ // Last row. Extend border.
+ right_context = vdup_n_u8(src[col + 7]);
+ } else {
+ right_context = vld1_u8(src + col + 7);
+ }
+
+ x = vreinterpretq_s16_u16(vsubl_u8(right_context, left_context));
+ y = vreinterpretq_s16_u16(vaddl_u8(right_context, left_context));
+ xy_low = vmull_s16(vget_low_s16(x), vget_low_s16(y));
+ xy_high = vmull_s16(vget_high_s16(x), vget_high_s16(y));
+
+ // Catch up to the last sum'd value.
+ sum_low = vdup_lane_s16(sum_high, 3);
+ sumsq_low = vdupq_lane_s32(vget_high_s32(sumsq_high), 1);
+
+ accumulate_sum_sumsq(vget_low_s16(x), xy_low, &sum_low, &sumsq_low);
+
+ // Need to do this sequentially because we need the max value from
+ // sum_low.
+ sum_high = vdup_lane_s16(sum_low, 3);
+ sumsq_high = vdupq_lane_s32(vget_high_s32(sumsq_low), 1);
+
+ accumulate_sum_sumsq(vget_high_s16(x), xy_high, &sum_high, &sumsq_high);
+
+ mask = combine_mask(sum_low, sum_high, sumsq_low, sumsq_high, f);
+
+ output = filter_pixels(vcombine_s16(sum_low, sum_high), s);
+ output = vbsl_u8(mask, output, s);
+
+ vst1_u8(src + col, output);
+
+ left_context = s;
+ }
+
+ src += pitch;
+ }
+}
+
+// Apply filter of (vpx_rv + sum + s[c]) >> 4.
+static uint8x8_t filter_pixels_rv(const int16x8_t sum, const uint8x8_t s,
+ const int16x8_t rv) {
+ const int16x8_t s16 = vreinterpretq_s16_u16(vmovl_u8(s));
+ const int16x8_t sum_s = vaddq_s16(sum, s16);
+ const int16x8_t rounded = vaddq_s16(sum_s, rv);
+
+ return vqshrun_n_s16(rounded, 4);
+}
+
+void vpx_mbpost_proc_down_neon(uint8_t *dst, int pitch, int rows, int cols,
+ int flimit) {
+ int row, col, i;
+ const int32x4_t f = vdupq_n_s32(flimit);
+ uint8x8_t below_context = vdup_n_u8(0);
+
+ // 8 columns are processed at a time.
+ // If rows is less than 8 the bottom border extension fails.
+ assert(cols % 8 == 0);
+ assert(rows >= 8);
+
+ // Load and keep the first 8 values in memory. Process a vertical stripe that
+ // is 8 wide.
+ for (col = 0; col < cols; col += 8) {
+ uint8x8_t s, above_context[8];
+ int16x8_t sum, sum_tmp;
+ int32x4_t sumsq_low, sumsq_high;
+
+ // Load and extend the top border.
+ s = vld1_u8(dst);
+ for (i = 0; i < 8; i++) {
+ above_context[i] = s;
+ }
+
+ sum_tmp = vreinterpretq_s16_u16(vmovl_u8(s));
+
+ // sum * 9
+ sum = vmulq_n_s16(sum_tmp, 9);
+
+ // (sum * 9) * sum == sum * sum * 9
+ sumsq_low = vmull_s16(vget_low_s16(sum), vget_low_s16(sum_tmp));
+ sumsq_high = vmull_s16(vget_high_s16(sum), vget_high_s16(sum_tmp));
+
+ // Load and discard the next 6 values to prime sum and sumsq.
+ for (i = 1; i <= 6; ++i) {
+ const uint8x8_t a = vld1_u8(dst + i * pitch);
+ const int16x8_t b = vreinterpretq_s16_u16(vmovl_u8(a));
+ sum = vaddq_s16(sum, b);
+
+ sumsq_low = vmlal_s16(sumsq_low, vget_low_s16(b), vget_low_s16(b));
+ sumsq_high = vmlal_s16(sumsq_high, vget_high_s16(b), vget_high_s16(b));
+ }
+
+ for (row = 0; row < rows; ++row) {
+ uint8x8_t mask, output;
+ int16x8_t x, y;
+ int32x4_t xy_low, xy_high;
+
+ s = vld1_u8(dst + row * pitch);
+
+ // Extend the bottom border.
+ if (row + 7 < rows) {
+ below_context = vld1_u8(dst + (row + 7) * pitch);
+ }
+
+ x = vreinterpretq_s16_u16(vsubl_u8(below_context, above_context[0]));
+ y = vreinterpretq_s16_u16(vaddl_u8(below_context, above_context[0]));
+ xy_low = vmull_s16(vget_low_s16(x), vget_low_s16(y));
+ xy_high = vmull_s16(vget_high_s16(x), vget_high_s16(y));
+
+ sum = vaddq_s16(sum, x);
+
+ sumsq_low = vaddq_s32(sumsq_low, xy_low);
+ sumsq_high = vaddq_s32(sumsq_high, xy_high);
+
+ mask = combine_mask(vget_low_s16(sum), vget_high_s16(sum), sumsq_low,
+ sumsq_high, f);
+
+ output = filter_pixels_rv(sum, s, vld1q_s16(vpx_rv + (row & 127)));
+ output = vbsl_u8(mask, output, s);
+
+ vst1_u8(dst + row * pitch, output);
+
+ above_context[0] = above_context[1];
+ above_context[1] = above_context[2];
+ above_context[2] = above_context[3];
+ above_context[3] = above_context[4];
+ above_context[4] = above_context[5];
+ above_context[5] = above_context[6];
+ above_context[6] = above_context[7];
+ above_context[7] = s;
+ }
+
+ dst += 8;
+ }
+}
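The NEON routines above vectorize a 15-tap sliding-window low-pass with a variance gate. A scalar sketch of the per-pixel recurrence and gate they implement (simplified, border extension omitted), not part of the patch:

/* Window for pixel c is s[c - 7] .. s[c + 7]; sum and sumsq slide right by
 * one via:
 *   sum   += s[c + 7] - s[c - 8];
 *   sumsq += s[c + 7] * s[c + 7] - s[c - 8] * s[c - 8];
 * which is what the x = right - left, y = right + left, xy = x * y lanes
 * above compute, since r*r - l*l == (r - l) * (r + l). */
static unsigned char filter_low_variance_pixel(int sum, int sumsq,
                                               unsigned char s_c, int flimit) {
  if (sumsq * 15 - sum * sum < flimit) {
    /* (8 + sum + s[c]) >> 4: rounded average of the window with the centre
     * counted twice; stays within 0..255 because sum <= 15 * 255. */
    return (unsigned char)((8 + sum + s_c) >> 4);
  }
  return s_c; /* leave pixels in busy areas untouched */
}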
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c
new file mode 100644
index 00000000000..26fa3e216bb
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c
@@ -0,0 +1,171 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/inv_txfm.h"
+
+static INLINE void highbd_idct4x4_1_add_kernel1(uint16_t **dest,
+ const int stride,
+ const int16x8_t res,
+ const int16x8_t max) {
+ const uint16x4_t a0 = vld1_u16(*dest);
+ const uint16x4_t a1 = vld1_u16(*dest + stride);
+ const int16x8_t a = vreinterpretq_s16_u16(vcombine_u16(a0, a1));
+ // Note: In some profile tests, res is quite close to +/-32767.
+ // We use saturating addition.
+ const int16x8_t b = vqaddq_s16(res, a);
+ const int16x8_t c = vminq_s16(b, max);
+ const uint16x8_t d = vqshluq_n_s16(c, 0);
+ vst1_u16(*dest, vget_low_u16(d));
+ *dest += stride;
+ vst1_u16(*dest, vget_high_u16(d));
+ *dest += stride;
+}
+
+// res is in reverse row order
+static INLINE void highbd_idct4x4_1_add_kernel2(uint16_t **dest,
+ const int stride,
+ const int16x8_t res,
+ const int16x8_t max) {
+ const uint16x4_t a0 = vld1_u16(*dest);
+ const uint16x4_t a1 = vld1_u16(*dest + stride);
+ const int16x8_t a = vreinterpretq_s16_u16(vcombine_u16(a1, a0));
+ // Note: In some profile tests, res is quite close to +/-32767.
+ // We use saturating addition.
+ const int16x8_t b = vqaddq_s16(res, a);
+ const int16x8_t c = vminq_s16(b, max);
+ const uint16x8_t d = vqshluq_n_s16(c, 0);
+ vst1_u16(*dest, vget_high_u16(d));
+ *dest += stride;
+ vst1_u16(*dest, vget_low_u16(d));
+ *dest += stride;
+}
+
+void vpx_highbd_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
+ const tran_low_t out0 =
+ HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
+ const tran_low_t out1 =
+ HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd);
+ const int16_t a1 = ROUND_POWER_OF_TWO(out1, 4);
+ const int16x8_t dc = vdupq_n_s16(a1);
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ highbd_idct4x4_1_add_kernel1(&dest, stride, dc, max);
+ highbd_idct4x4_1_add_kernel1(&dest, stride, dc, max);
+}
+
+static INLINE void idct4x4_16_kernel_bd10(const int32x4_t cospis,
+ int32x4_t *const a0,
+ int32x4_t *const a1,
+ int32x4_t *const a2,
+ int32x4_t *const a3) {
+ int32x4_t b0, b1, b2, b3;
+
+ transpose_s32_4x4(a0, a1, a2, a3);
+ b0 = vaddq_s32(*a0, *a2);
+ b1 = vsubq_s32(*a0, *a2);
+ b0 = vmulq_lane_s32(b0, vget_high_s32(cospis), 0);
+ b1 = vmulq_lane_s32(b1, vget_high_s32(cospis), 0);
+ b2 = vmulq_lane_s32(*a1, vget_high_s32(cospis), 1);
+ b3 = vmulq_lane_s32(*a1, vget_low_s32(cospis), 1);
+ b2 = vmlsq_lane_s32(b2, *a3, vget_low_s32(cospis), 1);
+ b3 = vmlaq_lane_s32(b3, *a3, vget_high_s32(cospis), 1);
+ b0 = vrshrq_n_s32(b0, 14);
+ b1 = vrshrq_n_s32(b1, 14);
+ b2 = vrshrq_n_s32(b2, 14);
+ b3 = vrshrq_n_s32(b3, 14);
+ *a0 = vaddq_s32(b0, b3);
+ *a1 = vaddq_s32(b1, b2);
+ *a2 = vsubq_s32(b1, b2);
+ *a3 = vsubq_s32(b0, b3);
+}
+
+static INLINE void idct4x4_16_kernel_bd12(const int32x4_t cospis,
+ int32x4_t *const a0,
+ int32x4_t *const a1,
+ int32x4_t *const a2,
+ int32x4_t *const a3) {
+ int32x4_t b0, b1, b2, b3;
+ int64x2_t c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11;
+
+ transpose_s32_4x4(a0, a1, a2, a3);
+ b0 = vaddq_s32(*a0, *a2);
+ b1 = vsubq_s32(*a0, *a2);
+ c0 = vmull_lane_s32(vget_low_s32(b0), vget_high_s32(cospis), 0);
+ c1 = vmull_lane_s32(vget_high_s32(b0), vget_high_s32(cospis), 0);
+ c2 = vmull_lane_s32(vget_low_s32(b1), vget_high_s32(cospis), 0);
+ c3 = vmull_lane_s32(vget_high_s32(b1), vget_high_s32(cospis), 0);
+ c4 = vmull_lane_s32(vget_low_s32(*a1), vget_high_s32(cospis), 1);
+ c5 = vmull_lane_s32(vget_high_s32(*a1), vget_high_s32(cospis), 1);
+ c6 = vmull_lane_s32(vget_low_s32(*a1), vget_low_s32(cospis), 1);
+ c7 = vmull_lane_s32(vget_high_s32(*a1), vget_low_s32(cospis), 1);
+ c8 = vmull_lane_s32(vget_low_s32(*a3), vget_low_s32(cospis), 1);
+ c9 = vmull_lane_s32(vget_high_s32(*a3), vget_low_s32(cospis), 1);
+ c10 = vmull_lane_s32(vget_low_s32(*a3), vget_high_s32(cospis), 1);
+ c11 = vmull_lane_s32(vget_high_s32(*a3), vget_high_s32(cospis), 1);
+ c4 = vsubq_s64(c4, c8);
+ c5 = vsubq_s64(c5, c9);
+ c6 = vaddq_s64(c6, c10);
+ c7 = vaddq_s64(c7, c11);
+ b0 = vcombine_s32(vrshrn_n_s64(c0, 14), vrshrn_n_s64(c1, 14));
+ b1 = vcombine_s32(vrshrn_n_s64(c2, 14), vrshrn_n_s64(c3, 14));
+ b2 = vcombine_s32(vrshrn_n_s64(c4, 14), vrshrn_n_s64(c5, 14));
+ b3 = vcombine_s32(vrshrn_n_s64(c6, 14), vrshrn_n_s64(c7, 14));
+ *a0 = vaddq_s32(b0, b3);
+ *a1 = vaddq_s32(b1, b2);
+ *a2 = vsubq_s32(b1, b2);
+ *a3 = vsubq_s32(b0, b3);
+}
+
+void vpx_highbd_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
+ int32x4_t c0 = vld1q_s32(input);
+ int32x4_t c1 = vld1q_s32(input + 4);
+ int32x4_t c2 = vld1q_s32(input + 8);
+ int32x4_t c3 = vld1q_s32(input + 12);
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+ int16x8_t a0, a1;
+
+ if (bd == 8) {
+ const int16x4_t cospis = vld1_s16(kCospi);
+
+ // Rows
+ a0 = vcombine_s16(vmovn_s32(c0), vmovn_s32(c1));
+ a1 = vcombine_s16(vmovn_s32(c2), vmovn_s32(c3));
+ idct4x4_16_kernel_bd8(cospis, &a0, &a1);
+
+ // Columns
+ a1 = vcombine_s16(vget_high_s16(a1), vget_low_s16(a1));
+ idct4x4_16_kernel_bd8(cospis, &a0, &a1);
+ a0 = vrshrq_n_s16(a0, 4);
+ a1 = vrshrq_n_s16(a1, 4);
+ } else {
+ const int32x4_t cospis = vld1q_s32(kCospi32);
+
+ if (bd == 10) {
+ idct4x4_16_kernel_bd10(cospis, &c0, &c1, &c2, &c3);
+ idct4x4_16_kernel_bd10(cospis, &c0, &c1, &c2, &c3);
+ } else {
+ idct4x4_16_kernel_bd12(cospis, &c0, &c1, &c2, &c3);
+ idct4x4_16_kernel_bd12(cospis, &c0, &c1, &c2, &c3);
+ }
+ a0 = vcombine_s16(vqrshrn_n_s32(c0, 4), vqrshrn_n_s32(c1, 4));
+ a1 = vcombine_s16(vqrshrn_n_s32(c3, 4), vqrshrn_n_s32(c2, 4));
+ }
+
+ highbd_idct4x4_1_add_kernel1(&dest, stride, a0, max);
+ highbd_idct4x4_1_add_kernel2(&dest, stride, a1, max);
+}
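The DC-only (idct4x4_1) path above can be read as the following scalar computation, assuming dct_const_round_shift(x) is (x + 8192) >> 14 and cospi_16_64 is 11585 as elsewhere in libvpx; HIGHBD_WRAPLOW clipping is omitted for brevity and clamp_px() is a hypothetical helper, so this is a sketch rather than the exact kernel:

#include <stdint.h>

static uint16_t clamp_px(int32_t v, int bd) {
  const int32_t max = (1 << bd) - 1;
  return (uint16_t)(v < 0 ? 0 : (v > max ? max : v));
}

static void idct4x4_dc_only_add(const int32_t *input, uint16_t *dest,
                                int stride, int bd) {
  const int64_t cospi_16_64 = 11585;
  const int32_t out0 = (int32_t)((input[0] * cospi_16_64 + 8192) >> 14);
  const int32_t out1 = (int32_t)((out0 * cospi_16_64 + 8192) >> 14);
  const int32_t dc = (out1 + 8) >> 4; /* ROUND_POWER_OF_TWO(out1, 4) */
  int r, c;
  for (r = 0; r < 4; ++r)
    for (c = 0; c < 4; ++c)
      dest[r * stride + c] = clamp_px(dest[r * stride + c] + dc, bd);
}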
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c
new file mode 100644
index 00000000000..c1c0f645d18
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c
@@ -0,0 +1,614 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/inv_txfm.h"
+
+static INLINE void highbd_idct8x8_1_add_kernel(uint16_t **dest,
+ const int stride,
+ const int16x8_t res,
+ const int16x8_t max) {
+ const uint16x8_t a = vld1q_u16(*dest);
+ const int16x8_t b = vaddq_s16(res, vreinterpretq_s16_u16(a));
+ const int16x8_t c = vminq_s16(b, max);
+ const uint16x8_t d = vqshluq_n_s16(c, 0);
+ vst1q_u16(*dest, d);
+ *dest += stride;
+}
+
+void vpx_highbd_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
+ const tran_low_t out0 =
+ HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
+ const tran_low_t out1 =
+ HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd);
+ const int16_t a1 = ROUND_POWER_OF_TWO(out1, 5);
+ const int16x8_t dc = vdupq_n_s16(a1);
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ highbd_idct8x8_1_add_kernel(&dest, stride, dc, max);
+ highbd_idct8x8_1_add_kernel(&dest, stride, dc, max);
+ highbd_idct8x8_1_add_kernel(&dest, stride, dc, max);
+ highbd_idct8x8_1_add_kernel(&dest, stride, dc, max);
+ highbd_idct8x8_1_add_kernel(&dest, stride, dc, max);
+ highbd_idct8x8_1_add_kernel(&dest, stride, dc, max);
+ highbd_idct8x8_1_add_kernel(&dest, stride, dc, max);
+ highbd_idct8x8_1_add_kernel(&dest, stride, dc, max);
+}
+
+static INLINE void idct8x8_12_half1d_bd10(
+ const int32x4_t cospis0, const int32x4_t cospis1, int32x4_t *const io0,
+ int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3,
+ int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6,
+ int32x4_t *const io7) {
+ int32x4_t step1[8], step2[8];
+
+ transpose_s32_4x4(io0, io1, io2, io3);
+
+ // stage 1
+ step1[4] = vmulq_lane_s32(*io1, vget_high_s32(cospis1), 1);
+ step1[5] = vmulq_lane_s32(*io3, vget_high_s32(cospis1), 0);
+ step1[6] = vmulq_lane_s32(*io3, vget_low_s32(cospis1), 1);
+ step1[7] = vmulq_lane_s32(*io1, vget_low_s32(cospis1), 0);
+ step1[4] = vrshrq_n_s32(step1[4], 14);
+ step1[5] = vrshrq_n_s32(step1[5], 14);
+ step1[6] = vrshrq_n_s32(step1[6], 14);
+ step1[7] = vrshrq_n_s32(step1[7], 14);
+
+ // stage 2
+ step2[1] = vmulq_lane_s32(*io0, vget_high_s32(cospis0), 0);
+ step2[2] = vmulq_lane_s32(*io2, vget_high_s32(cospis0), 1);
+ step2[3] = vmulq_lane_s32(*io2, vget_low_s32(cospis0), 1);
+ step2[1] = vrshrq_n_s32(step2[1], 14);
+ step2[2] = vrshrq_n_s32(step2[2], 14);
+ step2[3] = vrshrq_n_s32(step2[3], 14);
+
+ step2[4] = vaddq_s32(step1[4], step1[5]);
+ step2[5] = vsubq_s32(step1[4], step1[5]);
+ step2[6] = vsubq_s32(step1[7], step1[6]);
+ step2[7] = vaddq_s32(step1[7], step1[6]);
+
+ // stage 3
+ step1[0] = vaddq_s32(step2[1], step2[3]);
+ step1[1] = vaddq_s32(step2[1], step2[2]);
+ step1[2] = vsubq_s32(step2[1], step2[2]);
+ step1[3] = vsubq_s32(step2[1], step2[3]);
+
+ step1[6] = vmulq_lane_s32(step2[6], vget_high_s32(cospis0), 0);
+ step1[5] = vmlsq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0);
+ step1[6] = vmlaq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0);
+ step1[5] = vrshrq_n_s32(step1[5], 14);
+ step1[6] = vrshrq_n_s32(step1[6], 14);
+
+ // stage 4
+ *io0 = vaddq_s32(step1[0], step2[7]);
+ *io1 = vaddq_s32(step1[1], step1[6]);
+ *io2 = vaddq_s32(step1[2], step1[5]);
+ *io3 = vaddq_s32(step1[3], step2[4]);
+ *io4 = vsubq_s32(step1[3], step2[4]);
+ *io5 = vsubq_s32(step1[2], step1[5]);
+ *io6 = vsubq_s32(step1[1], step1[6]);
+ *io7 = vsubq_s32(step1[0], step2[7]);
+}
+
+static INLINE void idct8x8_12_half1d_bd12(
+ const int32x4_t cospis0, const int32x4_t cospis1, int32x4_t *const io0,
+ int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3,
+ int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6,
+ int32x4_t *const io7) {
+ int32x2_t input_1l, input_1h, input_3l, input_3h;
+ int32x2_t step1l[2], step1h[2];
+ int32x4_t step1[8], step2[8];
+ int64x2_t t64[8];
+ int32x2_t t32[8];
+
+ transpose_s32_4x4(io0, io1, io2, io3);
+
+ // stage 1
+ input_1l = vget_low_s32(*io1);
+ input_1h = vget_high_s32(*io1);
+ input_3l = vget_low_s32(*io3);
+ input_3h = vget_high_s32(*io3);
+ step1l[0] = vget_low_s32(*io0);
+ step1h[0] = vget_high_s32(*io0);
+ step1l[1] = vget_low_s32(*io2);
+ step1h[1] = vget_high_s32(*io2);
+
+ t64[0] = vmull_lane_s32(input_1l, vget_high_s32(cospis1), 1);
+ t64[1] = vmull_lane_s32(input_1h, vget_high_s32(cospis1), 1);
+ t64[2] = vmull_lane_s32(input_3l, vget_high_s32(cospis1), 0);
+ t64[3] = vmull_lane_s32(input_3h, vget_high_s32(cospis1), 0);
+ t64[4] = vmull_lane_s32(input_3l, vget_low_s32(cospis1), 1);
+ t64[5] = vmull_lane_s32(input_3h, vget_low_s32(cospis1), 1);
+ t64[6] = vmull_lane_s32(input_1l, vget_low_s32(cospis1), 0);
+ t64[7] = vmull_lane_s32(input_1h, vget_low_s32(cospis1), 0);
+ t32[0] = vrshrn_n_s64(t64[0], 14);
+ t32[1] = vrshrn_n_s64(t64[1], 14);
+ t32[2] = vrshrn_n_s64(t64[2], 14);
+ t32[3] = vrshrn_n_s64(t64[3], 14);
+ t32[4] = vrshrn_n_s64(t64[4], 14);
+ t32[5] = vrshrn_n_s64(t64[5], 14);
+ t32[6] = vrshrn_n_s64(t64[6], 14);
+ t32[7] = vrshrn_n_s64(t64[7], 14);
+ step1[4] = vcombine_s32(t32[0], t32[1]);
+ step1[5] = vcombine_s32(t32[2], t32[3]);
+ step1[6] = vcombine_s32(t32[4], t32[5]);
+ step1[7] = vcombine_s32(t32[6], t32[7]);
+
+ // stage 2
+ t64[2] = vmull_lane_s32(step1l[0], vget_high_s32(cospis0), 0);
+ t64[3] = vmull_lane_s32(step1h[0], vget_high_s32(cospis0), 0);
+ t64[4] = vmull_lane_s32(step1l[1], vget_high_s32(cospis0), 1);
+ t64[5] = vmull_lane_s32(step1h[1], vget_high_s32(cospis0), 1);
+ t64[6] = vmull_lane_s32(step1l[1], vget_low_s32(cospis0), 1);
+ t64[7] = vmull_lane_s32(step1h[1], vget_low_s32(cospis0), 1);
+ t32[2] = vrshrn_n_s64(t64[2], 14);
+ t32[3] = vrshrn_n_s64(t64[3], 14);
+ t32[4] = vrshrn_n_s64(t64[4], 14);
+ t32[5] = vrshrn_n_s64(t64[5], 14);
+ t32[6] = vrshrn_n_s64(t64[6], 14);
+ t32[7] = vrshrn_n_s64(t64[7], 14);
+ step2[1] = vcombine_s32(t32[2], t32[3]);
+ step2[2] = vcombine_s32(t32[4], t32[5]);
+ step2[3] = vcombine_s32(t32[6], t32[7]);
+
+ step2[4] = vaddq_s32(step1[4], step1[5]);
+ step2[5] = vsubq_s32(step1[4], step1[5]);
+ step2[6] = vsubq_s32(step1[7], step1[6]);
+ step2[7] = vaddq_s32(step1[7], step1[6]);
+
+ // stage 3
+ step1[0] = vaddq_s32(step2[1], step2[3]);
+ step1[1] = vaddq_s32(step2[1], step2[2]);
+ step1[2] = vsubq_s32(step2[1], step2[2]);
+ step1[3] = vsubq_s32(step2[1], step2[3]);
+
+ t64[2] = vmull_lane_s32(vget_low_s32(step2[6]), vget_high_s32(cospis0), 0);
+ t64[3] = vmull_lane_s32(vget_high_s32(step2[6]), vget_high_s32(cospis0), 0);
+ t64[0] =
+ vmlsl_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0);
+ t64[1] = vmlsl_lane_s32(t64[3], vget_high_s32(step2[5]),
+ vget_high_s32(cospis0), 0);
+ t64[2] =
+ vmlal_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0);
+ t64[3] = vmlal_lane_s32(t64[3], vget_high_s32(step2[5]),
+ vget_high_s32(cospis0), 0);
+ t32[0] = vrshrn_n_s64(t64[0], 14);
+ t32[1] = vrshrn_n_s64(t64[1], 14);
+ t32[2] = vrshrn_n_s64(t64[2], 14);
+ t32[3] = vrshrn_n_s64(t64[3], 14);
+ step1[5] = vcombine_s32(t32[0], t32[1]);
+ step1[6] = vcombine_s32(t32[2], t32[3]);
+
+ // stage 4
+ *io0 = vaddq_s32(step1[0], step2[7]);
+ *io1 = vaddq_s32(step1[1], step1[6]);
+ *io2 = vaddq_s32(step1[2], step1[5]);
+ *io3 = vaddq_s32(step1[3], step2[4]);
+ *io4 = vsubq_s32(step1[3], step2[4]);
+ *io5 = vsubq_s32(step1[2], step1[5]);
+ *io6 = vsubq_s32(step1[1], step1[6]);
+ *io7 = vsubq_s32(step1[0], step2[7]);
+}
+
+static INLINE void highbd_add8x8(int16x8_t a0, int16x8_t a1, int16x8_t a2,
+ int16x8_t a3, int16x8_t a4, int16x8_t a5,
+ int16x8_t a6, int16x8_t a7, uint16_t *dest,
+ const int stride, const int bd) {
+ const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
+ const uint16_t *dst = dest;
+ uint16x8_t d0, d1, d2, d3, d4, d5, d6, d7;
+ uint16x8_t d0_u16, d1_u16, d2_u16, d3_u16, d4_u16, d5_u16, d6_u16, d7_u16;
+ int16x8_t d0_s16, d1_s16, d2_s16, d3_s16, d4_s16, d5_s16, d6_s16, d7_s16;
+
+ d0 = vld1q_u16(dst);
+ dst += stride;
+ d1 = vld1q_u16(dst);
+ dst += stride;
+ d2 = vld1q_u16(dst);
+ dst += stride;
+ d3 = vld1q_u16(dst);
+ dst += stride;
+ d4 = vld1q_u16(dst);
+ dst += stride;
+ d5 = vld1q_u16(dst);
+ dst += stride;
+ d6 = vld1q_u16(dst);
+ dst += stride;
+ d7 = vld1q_u16(dst);
+
+ d0_s16 = vqaddq_s16(a0, vreinterpretq_s16_u16(d0));
+ d1_s16 = vqaddq_s16(a1, vreinterpretq_s16_u16(d1));
+ d2_s16 = vqaddq_s16(a2, vreinterpretq_s16_u16(d2));
+ d3_s16 = vqaddq_s16(a3, vreinterpretq_s16_u16(d3));
+ d4_s16 = vqaddq_s16(a4, vreinterpretq_s16_u16(d4));
+ d5_s16 = vqaddq_s16(a5, vreinterpretq_s16_u16(d5));
+ d6_s16 = vqaddq_s16(a6, vreinterpretq_s16_u16(d6));
+ d7_s16 = vqaddq_s16(a7, vreinterpretq_s16_u16(d7));
+
+ d0_s16 = vminq_s16(d0_s16, max);
+ d1_s16 = vminq_s16(d1_s16, max);
+ d2_s16 = vminq_s16(d2_s16, max);
+ d3_s16 = vminq_s16(d3_s16, max);
+ d4_s16 = vminq_s16(d4_s16, max);
+ d5_s16 = vminq_s16(d5_s16, max);
+ d6_s16 = vminq_s16(d6_s16, max);
+ d7_s16 = vminq_s16(d7_s16, max);
+ d0_u16 = vqshluq_n_s16(d0_s16, 0);
+ d1_u16 = vqshluq_n_s16(d1_s16, 0);
+ d2_u16 = vqshluq_n_s16(d2_s16, 0);
+ d3_u16 = vqshluq_n_s16(d3_s16, 0);
+ d4_u16 = vqshluq_n_s16(d4_s16, 0);
+ d5_u16 = vqshluq_n_s16(d5_s16, 0);
+ d6_u16 = vqshluq_n_s16(d6_s16, 0);
+ d7_u16 = vqshluq_n_s16(d7_s16, 0);
+
+ vst1q_u16(dest, d0_u16);
+ dest += stride;
+ vst1q_u16(dest, d1_u16);
+ dest += stride;
+ vst1q_u16(dest, d2_u16);
+ dest += stride;
+ vst1q_u16(dest, d3_u16);
+ dest += stride;
+ vst1q_u16(dest, d4_u16);
+ dest += stride;
+ vst1q_u16(dest, d5_u16);
+ dest += stride;
+ vst1q_u16(dest, d6_u16);
+ dest += stride;
+ vst1q_u16(dest, d7_u16);
+}
+
+void vpx_highbd_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+ int32x4_t a0 = vld1q_s32(input);
+ int32x4_t a1 = vld1q_s32(input + 8);
+ int32x4_t a2 = vld1q_s32(input + 16);
+ int32x4_t a3 = vld1q_s32(input + 24);
+ int16x8_t c0, c1, c2, c3, c4, c5, c6, c7;
+
+ if (bd == 8) {
+ const int16x8_t cospis = vld1q_s16(kCospi);
+ const int16x8_t cospisd = vaddq_s16(cospis, cospis);
+ const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24
+ const int16x4_t cospisd0 = vget_low_s16(cospisd); // doubled 0, 8, 16, 24
+ const int16x4_t cospisd1 = vget_high_s16(cospisd); // doubled 4, 12, 20, 28
+ int16x4_t b0 = vmovn_s32(a0);
+ int16x4_t b1 = vmovn_s32(a1);
+ int16x4_t b2 = vmovn_s32(a2);
+ int16x4_t b3 = vmovn_s32(a3);
+ int16x4_t b4, b5, b6, b7;
+
+ idct8x8_12_pass1_bd8(cospis0, cospisd0, cospisd1, &b0, &b1, &b2, &b3, &b4,
+ &b5, &b6, &b7);
+ idct8x8_12_pass2_bd8(cospis0, cospisd0, cospisd1, b0, b1, b2, b3, b4, b5,
+ b6, b7, &c0, &c1, &c2, &c3, &c4, &c5, &c6, &c7);
+ c0 = vrshrq_n_s16(c0, 5);
+ c1 = vrshrq_n_s16(c1, 5);
+ c2 = vrshrq_n_s16(c2, 5);
+ c3 = vrshrq_n_s16(c3, 5);
+ c4 = vrshrq_n_s16(c4, 5);
+ c5 = vrshrq_n_s16(c5, 5);
+ c6 = vrshrq_n_s16(c6, 5);
+ c7 = vrshrq_n_s16(c7, 5);
+ } else {
+ const int32x4_t cospis0 = vld1q_s32(kCospi32); // cospi 0, 8, 16, 24
+ const int32x4_t cospis1 = vld1q_s32(kCospi32 + 4); // cospi 4, 12, 20, 28
+ int32x4_t a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15;
+
+ if (bd == 10) {
+ idct8x8_12_half1d_bd10(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5,
+ &a6, &a7);
+ idct8x8_12_half1d_bd10(cospis0, cospis1, &a0, &a1, &a2, &a3, &a8, &a9,
+ &a10, &a11);
+ idct8x8_12_half1d_bd10(cospis0, cospis1, &a4, &a5, &a6, &a7, &a12, &a13,
+ &a14, &a15);
+ } else {
+ idct8x8_12_half1d_bd12(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5,
+ &a6, &a7);
+ idct8x8_12_half1d_bd12(cospis0, cospis1, &a0, &a1, &a2, &a3, &a8, &a9,
+ &a10, &a11);
+ idct8x8_12_half1d_bd12(cospis0, cospis1, &a4, &a5, &a6, &a7, &a12, &a13,
+ &a14, &a15);
+ }
+ c0 = vcombine_s16(vrshrn_n_s32(a0, 5), vrshrn_n_s32(a4, 5));
+ c1 = vcombine_s16(vrshrn_n_s32(a1, 5), vrshrn_n_s32(a5, 5));
+ c2 = vcombine_s16(vrshrn_n_s32(a2, 5), vrshrn_n_s32(a6, 5));
+ c3 = vcombine_s16(vrshrn_n_s32(a3, 5), vrshrn_n_s32(a7, 5));
+ c4 = vcombine_s16(vrshrn_n_s32(a8, 5), vrshrn_n_s32(a12, 5));
+ c5 = vcombine_s16(vrshrn_n_s32(a9, 5), vrshrn_n_s32(a13, 5));
+ c6 = vcombine_s16(vrshrn_n_s32(a10, 5), vrshrn_n_s32(a14, 5));
+ c7 = vcombine_s16(vrshrn_n_s32(a11, 5), vrshrn_n_s32(a15, 5));
+ }
+ highbd_add8x8(c0, c1, c2, c3, c4, c5, c6, c7, dest, stride, bd);
+}
+
+static INLINE void idct8x8_64_half1d_bd10(
+ const int32x4_t cospis0, const int32x4_t cospis1, int32x4_t *const io0,
+ int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3,
+ int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6,
+ int32x4_t *const io7) {
+ int32x4_t step1[8], step2[8];
+
+ transpose_s32_8x4(io0, io1, io2, io3, io4, io5, io6, io7);
+
+ // stage 1
+ step1[4] = vmulq_lane_s32(*io1, vget_high_s32(cospis1), 1);
+ step1[5] = vmulq_lane_s32(*io3, vget_high_s32(cospis1), 0);
+ step1[6] = vmulq_lane_s32(*io3, vget_low_s32(cospis1), 1);
+ step1[7] = vmulq_lane_s32(*io1, vget_low_s32(cospis1), 0);
+
+ step1[4] = vmlsq_lane_s32(step1[4], *io7, vget_low_s32(cospis1), 0);
+ step1[5] = vmlaq_lane_s32(step1[5], *io5, vget_low_s32(cospis1), 1);
+ step1[6] = vmlsq_lane_s32(step1[6], *io5, vget_high_s32(cospis1), 0);
+ step1[7] = vmlaq_lane_s32(step1[7], *io7, vget_high_s32(cospis1), 1);
+
+ step1[4] = vrshrq_n_s32(step1[4], 14);
+ step1[5] = vrshrq_n_s32(step1[5], 14);
+ step1[6] = vrshrq_n_s32(step1[6], 14);
+ step1[7] = vrshrq_n_s32(step1[7], 14);
+
+ // stage 2
+ step2[1] = vmulq_lane_s32(*io0, vget_high_s32(cospis0), 0);
+ step2[2] = vmulq_lane_s32(*io2, vget_high_s32(cospis0), 1);
+ step2[3] = vmulq_lane_s32(*io2, vget_low_s32(cospis0), 1);
+
+ step2[0] = vmlaq_lane_s32(step2[1], *io4, vget_high_s32(cospis0), 0);
+ step2[1] = vmlsq_lane_s32(step2[1], *io4, vget_high_s32(cospis0), 0);
+ step2[2] = vmlsq_lane_s32(step2[2], *io6, vget_low_s32(cospis0), 1);
+ step2[3] = vmlaq_lane_s32(step2[3], *io6, vget_high_s32(cospis0), 1);
+
+ step2[0] = vrshrq_n_s32(step2[0], 14);
+ step2[1] = vrshrq_n_s32(step2[1], 14);
+ step2[2] = vrshrq_n_s32(step2[2], 14);
+ step2[3] = vrshrq_n_s32(step2[3], 14);
+
+ step2[4] = vaddq_s32(step1[4], step1[5]);
+ step2[5] = vsubq_s32(step1[4], step1[5]);
+ step2[6] = vsubq_s32(step1[7], step1[6]);
+ step2[7] = vaddq_s32(step1[7], step1[6]);
+
+ // stage 3
+ step1[0] = vaddq_s32(step2[0], step2[3]);
+ step1[1] = vaddq_s32(step2[1], step2[2]);
+ step1[2] = vsubq_s32(step2[1], step2[2]);
+ step1[3] = vsubq_s32(step2[0], step2[3]);
+
+ step1[6] = vmulq_lane_s32(step2[6], vget_high_s32(cospis0), 0);
+ step1[5] = vmlsq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0);
+ step1[6] = vmlaq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0);
+ step1[5] = vrshrq_n_s32(step1[5], 14);
+ step1[6] = vrshrq_n_s32(step1[6], 14);
+
+ // stage 4
+ *io0 = vaddq_s32(step1[0], step2[7]);
+ *io1 = vaddq_s32(step1[1], step1[6]);
+ *io2 = vaddq_s32(step1[2], step1[5]);
+ *io3 = vaddq_s32(step1[3], step2[4]);
+ *io4 = vsubq_s32(step1[3], step2[4]);
+ *io5 = vsubq_s32(step1[2], step1[5]);
+ *io6 = vsubq_s32(step1[1], step1[6]);
+ *io7 = vsubq_s32(step1[0], step2[7]);
+}
+
+static INLINE void idct8x8_64_half1d_bd12(
+ const int32x4_t cospis0, const int32x4_t cospis1, int32x4_t *const io0,
+ int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3,
+ int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6,
+ int32x4_t *const io7) {
+ int32x2_t input_1l, input_1h, input_3l, input_3h, input_5l, input_5h,
+ input_7l, input_7h;
+ int32x2_t step1l[4], step1h[4];
+ int32x4_t step1[8], step2[8];
+ int64x2_t t64[8];
+ int32x2_t t32[8];
+
+ transpose_s32_8x4(io0, io1, io2, io3, io4, io5, io6, io7);
+
+ // stage 1
+ input_1l = vget_low_s32(*io1);
+ input_1h = vget_high_s32(*io1);
+ input_3l = vget_low_s32(*io3);
+ input_3h = vget_high_s32(*io3);
+ input_5l = vget_low_s32(*io5);
+ input_5h = vget_high_s32(*io5);
+ input_7l = vget_low_s32(*io7);
+ input_7h = vget_high_s32(*io7);
+ step1l[0] = vget_low_s32(*io0);
+ step1h[0] = vget_high_s32(*io0);
+ step1l[1] = vget_low_s32(*io2);
+ step1h[1] = vget_high_s32(*io2);
+ step1l[2] = vget_low_s32(*io4);
+ step1h[2] = vget_high_s32(*io4);
+ step1l[3] = vget_low_s32(*io6);
+ step1h[3] = vget_high_s32(*io6);
+
+ t64[0] = vmull_lane_s32(input_1l, vget_high_s32(cospis1), 1);
+ t64[1] = vmull_lane_s32(input_1h, vget_high_s32(cospis1), 1);
+ t64[2] = vmull_lane_s32(input_3l, vget_high_s32(cospis1), 0);
+ t64[3] = vmull_lane_s32(input_3h, vget_high_s32(cospis1), 0);
+ t64[4] = vmull_lane_s32(input_3l, vget_low_s32(cospis1), 1);
+ t64[5] = vmull_lane_s32(input_3h, vget_low_s32(cospis1), 1);
+ t64[6] = vmull_lane_s32(input_1l, vget_low_s32(cospis1), 0);
+ t64[7] = vmull_lane_s32(input_1h, vget_low_s32(cospis1), 0);
+ t64[0] = vmlsl_lane_s32(t64[0], input_7l, vget_low_s32(cospis1), 0);
+ t64[1] = vmlsl_lane_s32(t64[1], input_7h, vget_low_s32(cospis1), 0);
+ t64[2] = vmlal_lane_s32(t64[2], input_5l, vget_low_s32(cospis1), 1);
+ t64[3] = vmlal_lane_s32(t64[3], input_5h, vget_low_s32(cospis1), 1);
+ t64[4] = vmlsl_lane_s32(t64[4], input_5l, vget_high_s32(cospis1), 0);
+ t64[5] = vmlsl_lane_s32(t64[5], input_5h, vget_high_s32(cospis1), 0);
+ t64[6] = vmlal_lane_s32(t64[6], input_7l, vget_high_s32(cospis1), 1);
+ t64[7] = vmlal_lane_s32(t64[7], input_7h, vget_high_s32(cospis1), 1);
+ t32[0] = vrshrn_n_s64(t64[0], 14);
+ t32[1] = vrshrn_n_s64(t64[1], 14);
+ t32[2] = vrshrn_n_s64(t64[2], 14);
+ t32[3] = vrshrn_n_s64(t64[3], 14);
+ t32[4] = vrshrn_n_s64(t64[4], 14);
+ t32[5] = vrshrn_n_s64(t64[5], 14);
+ t32[6] = vrshrn_n_s64(t64[6], 14);
+ t32[7] = vrshrn_n_s64(t64[7], 14);
+ step1[4] = vcombine_s32(t32[0], t32[1]);
+ step1[5] = vcombine_s32(t32[2], t32[3]);
+ step1[6] = vcombine_s32(t32[4], t32[5]);
+ step1[7] = vcombine_s32(t32[6], t32[7]);
+
+ // stage 2
+ t64[2] = vmull_lane_s32(step1l[0], vget_high_s32(cospis0), 0);
+ t64[3] = vmull_lane_s32(step1h[0], vget_high_s32(cospis0), 0);
+ t64[4] = vmull_lane_s32(step1l[1], vget_high_s32(cospis0), 1);
+ t64[5] = vmull_lane_s32(step1h[1], vget_high_s32(cospis0), 1);
+ t64[6] = vmull_lane_s32(step1l[1], vget_low_s32(cospis0), 1);
+ t64[7] = vmull_lane_s32(step1h[1], vget_low_s32(cospis0), 1);
+ t64[0] = vmlal_lane_s32(t64[2], step1l[2], vget_high_s32(cospis0), 0);
+ t64[1] = vmlal_lane_s32(t64[3], step1h[2], vget_high_s32(cospis0), 0);
+ t64[2] = vmlsl_lane_s32(t64[2], step1l[2], vget_high_s32(cospis0), 0);
+ t64[3] = vmlsl_lane_s32(t64[3], step1h[2], vget_high_s32(cospis0), 0);
+ t64[4] = vmlsl_lane_s32(t64[4], step1l[3], vget_low_s32(cospis0), 1);
+ t64[5] = vmlsl_lane_s32(t64[5], step1h[3], vget_low_s32(cospis0), 1);
+ t64[6] = vmlal_lane_s32(t64[6], step1l[3], vget_high_s32(cospis0), 1);
+ t64[7] = vmlal_lane_s32(t64[7], step1h[3], vget_high_s32(cospis0), 1);
+ t32[0] = vrshrn_n_s64(t64[0], 14);
+ t32[1] = vrshrn_n_s64(t64[1], 14);
+ t32[2] = vrshrn_n_s64(t64[2], 14);
+ t32[3] = vrshrn_n_s64(t64[3], 14);
+ t32[4] = vrshrn_n_s64(t64[4], 14);
+ t32[5] = vrshrn_n_s64(t64[5], 14);
+ t32[6] = vrshrn_n_s64(t64[6], 14);
+ t32[7] = vrshrn_n_s64(t64[7], 14);
+ step2[0] = vcombine_s32(t32[0], t32[1]);
+ step2[1] = vcombine_s32(t32[2], t32[3]);
+ step2[2] = vcombine_s32(t32[4], t32[5]);
+ step2[3] = vcombine_s32(t32[6], t32[7]);
+
+ step2[4] = vaddq_s32(step1[4], step1[5]);
+ step2[5] = vsubq_s32(step1[4], step1[5]);
+ step2[6] = vsubq_s32(step1[7], step1[6]);
+ step2[7] = vaddq_s32(step1[7], step1[6]);
+
+ // stage 3
+ step1[0] = vaddq_s32(step2[0], step2[3]);
+ step1[1] = vaddq_s32(step2[1], step2[2]);
+ step1[2] = vsubq_s32(step2[1], step2[2]);
+ step1[3] = vsubq_s32(step2[0], step2[3]);
+
+ t64[2] = vmull_lane_s32(vget_low_s32(step2[6]), vget_high_s32(cospis0), 0);
+ t64[3] = vmull_lane_s32(vget_high_s32(step2[6]), vget_high_s32(cospis0), 0);
+ t64[0] =
+ vmlsl_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0);
+ t64[1] = vmlsl_lane_s32(t64[3], vget_high_s32(step2[5]),
+ vget_high_s32(cospis0), 0);
+ t64[2] =
+ vmlal_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0);
+ t64[3] = vmlal_lane_s32(t64[3], vget_high_s32(step2[5]),
+ vget_high_s32(cospis0), 0);
+ t32[0] = vrshrn_n_s64(t64[0], 14);
+ t32[1] = vrshrn_n_s64(t64[1], 14);
+ t32[2] = vrshrn_n_s64(t64[2], 14);
+ t32[3] = vrshrn_n_s64(t64[3], 14);
+ step1[5] = vcombine_s32(t32[0], t32[1]);
+ step1[6] = vcombine_s32(t32[2], t32[3]);
+
+ // stage 4
+ *io0 = vaddq_s32(step1[0], step2[7]);
+ *io1 = vaddq_s32(step1[1], step1[6]);
+ *io2 = vaddq_s32(step1[2], step1[5]);
+ *io3 = vaddq_s32(step1[3], step2[4]);
+ *io4 = vsubq_s32(step1[3], step2[4]);
+ *io5 = vsubq_s32(step1[2], step1[5]);
+ *io6 = vsubq_s32(step1[1], step1[6]);
+ *io7 = vsubq_s32(step1[0], step2[7]);
+}
+
+void vpx_highbd_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+ int32x4_t a0 = vld1q_s32(input);
+ int32x4_t a1 = vld1q_s32(input + 4);
+ int32x4_t a2 = vld1q_s32(input + 8);
+ int32x4_t a3 = vld1q_s32(input + 12);
+ int32x4_t a4 = vld1q_s32(input + 16);
+ int32x4_t a5 = vld1q_s32(input + 20);
+ int32x4_t a6 = vld1q_s32(input + 24);
+ int32x4_t a7 = vld1q_s32(input + 28);
+ int32x4_t a8 = vld1q_s32(input + 32);
+ int32x4_t a9 = vld1q_s32(input + 36);
+ int32x4_t a10 = vld1q_s32(input + 40);
+ int32x4_t a11 = vld1q_s32(input + 44);
+ int32x4_t a12 = vld1q_s32(input + 48);
+ int32x4_t a13 = vld1q_s32(input + 52);
+ int32x4_t a14 = vld1q_s32(input + 56);
+ int32x4_t a15 = vld1q_s32(input + 60);
+ int16x8_t c0, c1, c2, c3, c4, c5, c6, c7;
+
+ if (bd == 8) {
+ const int16x8_t cospis = vld1q_s16(kCospi);
+ const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24
+ const int16x4_t cospis1 = vget_high_s16(cospis); // cospi 4, 12, 20, 28
+ int16x8_t b0 = vcombine_s16(vmovn_s32(a0), vmovn_s32(a1));
+ int16x8_t b1 = vcombine_s16(vmovn_s32(a2), vmovn_s32(a3));
+ int16x8_t b2 = vcombine_s16(vmovn_s32(a4), vmovn_s32(a5));
+ int16x8_t b3 = vcombine_s16(vmovn_s32(a6), vmovn_s32(a7));
+ int16x8_t b4 = vcombine_s16(vmovn_s32(a8), vmovn_s32(a9));
+ int16x8_t b5 = vcombine_s16(vmovn_s32(a10), vmovn_s32(a11));
+ int16x8_t b6 = vcombine_s16(vmovn_s32(a12), vmovn_s32(a13));
+ int16x8_t b7 = vcombine_s16(vmovn_s32(a14), vmovn_s32(a15));
+
+ idct8x8_64_1d_bd8(cospis0, cospis1, &b0, &b1, &b2, &b3, &b4, &b5, &b6, &b7);
+ idct8x8_64_1d_bd8(cospis0, cospis1, &b0, &b1, &b2, &b3, &b4, &b5, &b6, &b7);
+
+ c0 = vrshrq_n_s16(b0, 5);
+ c1 = vrshrq_n_s16(b1, 5);
+ c2 = vrshrq_n_s16(b2, 5);
+ c3 = vrshrq_n_s16(b3, 5);
+ c4 = vrshrq_n_s16(b4, 5);
+ c5 = vrshrq_n_s16(b5, 5);
+ c6 = vrshrq_n_s16(b6, 5);
+ c7 = vrshrq_n_s16(b7, 5);
+ } else {
+ const int32x4_t cospis0 = vld1q_s32(kCospi32); // cospi 0, 8, 16, 24
+ const int32x4_t cospis1 = vld1q_s32(kCospi32 + 4); // cospi 4, 12, 20, 28
+
+ if (bd == 10) {
+ idct8x8_64_half1d_bd10(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5,
+ &a6, &a7);
+ idct8x8_64_half1d_bd10(cospis0, cospis1, &a8, &a9, &a10, &a11, &a12, &a13,
+ &a14, &a15);
+ idct8x8_64_half1d_bd10(cospis0, cospis1, &a0, &a8, &a1, &a9, &a2, &a10,
+ &a3, &a11);
+ idct8x8_64_half1d_bd10(cospis0, cospis1, &a4, &a12, &a5, &a13, &a6, &a14,
+ &a7, &a15);
+ } else {
+ idct8x8_64_half1d_bd12(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5,
+ &a6, &a7);
+ idct8x8_64_half1d_bd12(cospis0, cospis1, &a8, &a9, &a10, &a11, &a12, &a13,
+ &a14, &a15);
+ idct8x8_64_half1d_bd12(cospis0, cospis1, &a0, &a8, &a1, &a9, &a2, &a10,
+ &a3, &a11);
+ idct8x8_64_half1d_bd12(cospis0, cospis1, &a4, &a12, &a5, &a13, &a6, &a14,
+ &a7, &a15);
+ }
+ c0 = vcombine_s16(vrshrn_n_s32(a0, 5), vrshrn_n_s32(a4, 5));
+ c1 = vcombine_s16(vrshrn_n_s32(a8, 5), vrshrn_n_s32(a12, 5));
+ c2 = vcombine_s16(vrshrn_n_s32(a1, 5), vrshrn_n_s32(a5, 5));
+ c3 = vcombine_s16(vrshrn_n_s32(a9, 5), vrshrn_n_s32(a13, 5));
+ c4 = vcombine_s16(vrshrn_n_s32(a2, 5), vrshrn_n_s32(a6, 5));
+ c5 = vcombine_s16(vrshrn_n_s32(a10, 5), vrshrn_n_s32(a14, 5));
+ c6 = vcombine_s16(vrshrn_n_s32(a3, 5), vrshrn_n_s32(a7, 5));
+ c7 = vcombine_s16(vrshrn_n_s32(a11, 5), vrshrn_n_s32(a15, 5));
+ }
+ highbd_add8x8(c0, c1, c2, c3, c4, c5, c6, c7, dest, stride, bd);
+}
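For the 8x8 paths above, the final step is a rounding shift by 5 (vrshrn_n_s32 / vrshrq_n_s16) followed by the add-and-clip performed in highbd_add8x8(). A scalar sketch of that per-pixel step (assumes arithmetic right shift on negative values; names hypothetical), not part of the patch:

#include <stdint.h>

static uint16_t descale_add_clip(int32_t coeff, uint16_t pixel, int bd) {
  const int32_t max = (1 << bd) - 1;
  const int32_t res = (coeff + 16) >> 5; /* rounding shift right by 5 */
  int32_t v = (int32_t)pixel + res;
  if (v < 0) v = 0;
  if (v > max) v = max;
  return (uint16_t)v;
}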
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_intrapred_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_intrapred_neon.c
new file mode 100644
index 00000000000..6f7e5da7627
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_intrapred_neon.c
@@ -0,0 +1,1078 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+//------------------------------------------------------------------------------
+// DC 4x4
+
+static INLINE uint16x4_t dc_sum_4(const uint16_t *ref) {
+ const uint16x4_t ref_u16 = vld1_u16(ref);
+ const uint16x4_t p0 = vpadd_u16(ref_u16, ref_u16);
+ return vpadd_u16(p0, p0);
+}
+
+static INLINE void dc_store_4x4(uint16_t *dst, ptrdiff_t stride,
+ const uint16x4_t dc) {
+ const uint16x4_t dc_dup = vdup_lane_u16(dc, 0);
+ int i;
+ for (i = 0; i < 4; ++i, dst += stride) {
+ vst1_u16(dst, dc_dup);
+ }
+}
+
+void vpx_highbd_dc_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x4_t a = vld1_u16(above);
+ const uint16x4_t l = vld1_u16(left);
+ uint16x4_t sum;
+ uint16x4_t dc;
+ (void)bd;
+ sum = vadd_u16(a, l);
+ sum = vpadd_u16(sum, sum);
+ sum = vpadd_u16(sum, sum);
+ dc = vrshr_n_u16(sum, 3);
+ dc_store_4x4(dst, stride, dc);
+}
+
+void vpx_highbd_dc_left_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x4_t sum = dc_sum_4(left);
+ const uint16x4_t dc = vrshr_n_u16(sum, 2);
+ (void)above;
+ (void)bd;
+ dc_store_4x4(dst, stride, dc);
+}
+
+void vpx_highbd_dc_top_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x4_t sum = dc_sum_4(above);
+ const uint16x4_t dc = vrshr_n_u16(sum, 2);
+ (void)left;
+ (void)bd;
+ dc_store_4x4(dst, stride, dc);
+}
+
+void vpx_highbd_dc_128_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x4_t dc = vdup_n_u16(1 << (bd - 1));
+ (void)above;
+ (void)left;
+ dc_store_4x4(dst, stride, dc);
+}
+
+//------------------------------------------------------------------------------
+// DC 8x8
+
+static INLINE uint16x4_t dc_sum_8(const uint16_t *ref) {
+ const uint16x8_t ref_u16 = vld1q_u16(ref);
+ uint16x4_t sum = vadd_u16(vget_low_u16(ref_u16), vget_high_u16(ref_u16));
+ sum = vpadd_u16(sum, sum);
+ return vpadd_u16(sum, sum);
+}
+
+static INLINE void dc_store_8x8(uint16_t *dst, ptrdiff_t stride,
+ const uint16x4_t dc) {
+ const uint16x8_t dc_dup = vdupq_lane_u16(dc, 0);
+ int i;
+ for (i = 0; i < 8; ++i, dst += stride) {
+ vst1q_u16(dst, dc_dup);
+ }
+}
+
+void vpx_highbd_dc_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x8_t above_u16 = vld1q_u16(above);
+ const uint16x8_t left_u16 = vld1q_u16(left);
+ const uint16x8_t p0 = vaddq_u16(above_u16, left_u16);
+ uint16x4_t sum = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
+ uint16x4_t dc;
+ (void)bd;
+ sum = vpadd_u16(sum, sum);
+ sum = vpadd_u16(sum, sum);
+ dc = vrshr_n_u16(sum, 4);
+ dc_store_8x8(dst, stride, dc);
+}
+
+void vpx_highbd_dc_left_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x4_t sum = dc_sum_8(left);
+ const uint16x4_t dc = vrshr_n_u16(sum, 3);
+ (void)above;
+ (void)bd;
+ dc_store_8x8(dst, stride, dc);
+}
+
+void vpx_highbd_dc_top_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x4_t sum = dc_sum_8(above);
+ const uint16x4_t dc = vrshr_n_u16(sum, 3);
+ (void)left;
+ (void)bd;
+ dc_store_8x8(dst, stride, dc);
+}
+
+void vpx_highbd_dc_128_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x4_t dc = vdup_n_u16(1 << (bd - 1));
+ (void)above;
+ (void)left;
+ dc_store_8x8(dst, stride, dc);
+}
+
+//------------------------------------------------------------------------------
+// DC 16x16
+
+static INLINE uint16x4_t dc_sum_16(const uint16_t *ref) {
+ const uint16x8x2_t ref_u16 = vld2q_u16(ref);
+ const uint16x8_t p0 = vaddq_u16(ref_u16.val[0], ref_u16.val[1]);
+ uint16x4_t sum = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
+ sum = vpadd_u16(sum, sum);
+ return vpadd_u16(sum, sum);
+}
+
+static INLINE void dc_store_16x16(uint16_t *dst, ptrdiff_t stride,
+ const uint16x4_t dc) {
+ uint16x8x2_t dc_dup;
+ int i;
+ dc_dup.val[0] = dc_dup.val[1] = vdupq_lane_u16(dc, 0);
+ for (i = 0; i < 16; ++i, dst += stride) {
+ vst2q_u16(dst, dc_dup);
+ }
+}
+
+void vpx_highbd_dc_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x8x2_t a = vld2q_u16(above);
+ const uint16x8x2_t l = vld2q_u16(left);
+ const uint16x8_t pa = vaddq_u16(a.val[0], a.val[1]);
+ const uint16x8_t pl = vaddq_u16(l.val[0], l.val[1]);
+ const uint16x8_t pal0 = vaddq_u16(pa, pl);
+ uint16x4_t pal1 = vadd_u16(vget_low_u16(pal0), vget_high_u16(pal0));
+ uint32x2_t sum;
+ uint16x4_t dc;
+ (void)bd;
+ pal1 = vpadd_u16(pal1, pal1);
+ sum = vpaddl_u16(pal1);
+ dc = vreinterpret_u16_u32(vrshr_n_u32(sum, 5));
+ dc_store_16x16(dst, stride, dc);
+}
+
+void vpx_highbd_dc_left_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x4_t sum = dc_sum_16(left);
+ const uint16x4_t dc = vrshr_n_u16(sum, 4);
+ (void)above;
+ (void)bd;
+ dc_store_16x16(dst, stride, dc);
+}
+
+void vpx_highbd_dc_top_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x4_t sum = dc_sum_16(above);
+ const uint16x4_t dc = vrshr_n_u16(sum, 4);
+ (void)left;
+ (void)bd;
+ dc_store_16x16(dst, stride, dc);
+}
+
+void vpx_highbd_dc_128_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x4_t dc = vdup_n_u16(1 << (bd - 1));
+ (void)above;
+ (void)left;
+ dc_store_16x16(dst, stride, dc);
+}
+
+//------------------------------------------------------------------------------
+// DC 32x32
+
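+// With 32 (or, for the full predictor, 64) edge samples a 12-bit sum no longer
+// fits in 16 bits, so the final reduction is widened to 32 bits with vpaddl
+// before the rounding shift.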
+static INLINE uint32x2_t dc_sum_32(const uint16_t *ref) {
+ const uint16x8x4_t r = vld4q_u16(ref);
+ const uint16x8_t p0 = vaddq_u16(r.val[0], r.val[1]);
+ const uint16x8_t p1 = vaddq_u16(r.val[2], r.val[3]);
+ const uint16x8_t p2 = vaddq_u16(p0, p1);
+ uint16x4_t sum = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));
+ sum = vpadd_u16(sum, sum);
+ return vpaddl_u16(sum);
+}
+
+static INLINE void dc_store_32x32(uint16_t *dst, ptrdiff_t stride,
+ const uint16x4_t dc) {
+ uint16x8x2_t dc_dup;
+ int i;
+ dc_dup.val[0] = dc_dup.val[1] = vdupq_lane_u16(dc, 0);
+
+ for (i = 0; i < 32; ++i) {
+ vst2q_u16(dst, dc_dup);
+ dst += 16;
+ vst2q_u16(dst, dc_dup);
+ dst += stride - 16;
+ }
+}
+
+void vpx_highbd_dc_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x8x4_t a = vld4q_u16(above);
+ const uint16x8x4_t l = vld4q_u16(left);
+ const uint16x8_t pa0 = vaddq_u16(a.val[0], a.val[1]);
+ const uint16x8_t pa1 = vaddq_u16(a.val[2], a.val[3]);
+ const uint16x8_t pl0 = vaddq_u16(l.val[0], l.val[1]);
+ const uint16x8_t pl1 = vaddq_u16(l.val[2], l.val[3]);
+ const uint16x8_t pa = vaddq_u16(pa0, pa1);
+ const uint16x8_t pl = vaddq_u16(pl0, pl1);
+ const uint16x8_t pal0 = vaddq_u16(pa, pl);
+ const uint16x4_t pal1 = vadd_u16(vget_low_u16(pal0), vget_high_u16(pal0));
+ uint32x2_t sum = vpaddl_u16(pal1);
+ uint16x4_t dc;
+ (void)bd;
+ sum = vpadd_u32(sum, sum);
+ dc = vreinterpret_u16_u32(vrshr_n_u32(sum, 6));
+ dc_store_32x32(dst, stride, dc);
+}
+
+void vpx_highbd_dc_left_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint32x2_t sum = dc_sum_32(left);
+ const uint16x4_t dc = vreinterpret_u16_u32(vrshr_n_u32(sum, 5));
+ (void)above;
+ (void)bd;
+ dc_store_32x32(dst, stride, dc);
+}
+
+void vpx_highbd_dc_top_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint32x2_t sum = dc_sum_32(above);
+ const uint16x4_t dc = vreinterpret_u16_u32(vrshr_n_u32(sum, 5));
+ (void)left;
+ (void)bd;
+ dc_store_32x32(dst, stride, dc);
+}
+
+void vpx_highbd_dc_128_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x4_t dc = vdup_n_u16(1 << (bd - 1));
+ (void)above;
+ (void)left;
+ dc_store_32x32(dst, stride, dc);
+}
+
+// -----------------------------------------------------------------------------
+
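+// D45 prediction extrapolates from the above row along the 45 degree
+// (up-right) direction.  Each sample is the 3-tap average
+//   (a + 2 * b + c + 2) >> 2
+// of three consecutive above samples, computed as a halving add of the outer
+// taps followed by a rounding halving add with the centre tap.  Each row down
+// shifts the filtered vector by one sample (vext), replicating the last above
+// sample once the row runs past the edge.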
+void vpx_highbd_d45_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x8_t ABCDEFGH = vld1q_u16(above);
+ const uint16x8_t BCDEFGH0 = vld1q_u16(above + 1);
+ const uint16x8_t CDEFGH00 = vld1q_u16(above + 2);
+ const uint16x8_t avg1 = vhaddq_u16(ABCDEFGH, CDEFGH00);
+ const uint16x8_t avg2 = vrhaddq_u16(avg1, BCDEFGH0);
+ const uint16x4_t avg2_low = vget_low_u16(avg2);
+ const uint16x4_t avg2_high = vget_high_u16(avg2);
+ const uint16x4_t r1 = vext_u16(avg2_low, avg2_high, 1);
+ const uint16x4_t r2 = vext_u16(avg2_low, avg2_high, 2);
+ const uint16x4_t r3 = vext_u16(avg2_low, avg2_high, 3);
+ (void)left;
+ (void)bd;
+ vst1_u16(dst, avg2_low);
+ dst += stride;
+ vst1_u16(dst, r1);
+ dst += stride;
+ vst1_u16(dst, r2);
+ dst += stride;
+ vst1_u16(dst, r3);
+ vst1q_lane_u16(dst + 3, ABCDEFGH, 7);
+}
+
+static INLINE void d45_store_8(uint16_t **dst, const ptrdiff_t stride,
+ const uint16x8_t above_right, uint16x8_t *row) {
+ *row = vextq_u16(*row, above_right, 1);
+ vst1q_u16(*dst, *row);
+ *dst += stride;
+}
+
+void vpx_highbd_d45_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x8_t A0 = vld1q_u16(above);
+ const uint16x8_t above_right = vdupq_lane_u16(vget_high_u16(A0), 3);
+ const uint16x8_t A1 = vld1q_u16(above + 1);
+ const uint16x8_t A2 = vld1q_u16(above + 2);
+ const uint16x8_t avg1 = vhaddq_u16(A0, A2);
+ uint16x8_t row = vrhaddq_u16(avg1, A1);
+ (void)left;
+ (void)bd;
+
+ vst1q_u16(dst, row);
+ dst += stride;
+ d45_store_8(&dst, stride, above_right, &row);
+ d45_store_8(&dst, stride, above_right, &row);
+ d45_store_8(&dst, stride, above_right, &row);
+ d45_store_8(&dst, stride, above_right, &row);
+ d45_store_8(&dst, stride, above_right, &row);
+ d45_store_8(&dst, stride, above_right, &row);
+ vst1q_u16(dst, above_right);
+}
+
+static INLINE void d45_store_16(uint16_t **dst, const ptrdiff_t stride,
+ const uint16x8_t above_right, uint16x8_t *row_0,
+ uint16x8_t *row_1) {
+ *row_0 = vextq_u16(*row_0, *row_1, 1);
+ *row_1 = vextq_u16(*row_1, above_right, 1);
+ vst1q_u16(*dst, *row_0);
+ *dst += 8;
+ vst1q_u16(*dst, *row_1);
+ *dst += stride - 8;
+}
+
+void vpx_highbd_d45_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x8_t A0_0 = vld1q_u16(above);
+ const uint16x8_t A0_1 = vld1q_u16(above + 8);
+ const uint16x8_t above_right = vdupq_lane_u16(vget_high_u16(A0_1), 3);
+ const uint16x8_t A1_0 = vld1q_u16(above + 1);
+ const uint16x8_t A1_1 = vld1q_u16(above + 9);
+ const uint16x8_t A2_0 = vld1q_u16(above + 2);
+ const uint16x8_t A2_1 = vld1q_u16(above + 10);
+ const uint16x8_t avg_0 = vhaddq_u16(A0_0, A2_0);
+ const uint16x8_t avg_1 = vhaddq_u16(A0_1, A2_1);
+ uint16x8_t row_0 = vrhaddq_u16(avg_0, A1_0);
+ uint16x8_t row_1 = vrhaddq_u16(avg_1, A1_1);
+ (void)left;
+ (void)bd;
+
+ vst1q_u16(dst, row_0);
+ vst1q_u16(dst + 8, row_1);
+ dst += stride;
+ d45_store_16(&dst, stride, above_right, &row_0, &row_1);
+ d45_store_16(&dst, stride, above_right, &row_0, &row_1);
+ d45_store_16(&dst, stride, above_right, &row_0, &row_1);
+ d45_store_16(&dst, stride, above_right, &row_0, &row_1);
+ d45_store_16(&dst, stride, above_right, &row_0, &row_1);
+ d45_store_16(&dst, stride, above_right, &row_0, &row_1);
+ d45_store_16(&dst, stride, above_right, &row_0, &row_1);
+ d45_store_16(&dst, stride, above_right, &row_0, &row_1);
+ d45_store_16(&dst, stride, above_right, &row_0, &row_1);
+ d45_store_16(&dst, stride, above_right, &row_0, &row_1);
+ d45_store_16(&dst, stride, above_right, &row_0, &row_1);
+ d45_store_16(&dst, stride, above_right, &row_0, &row_1);
+ d45_store_16(&dst, stride, above_right, &row_0, &row_1);
+ d45_store_16(&dst, stride, above_right, &row_0, &row_1);
+ vst1q_u16(dst, above_right);
+ vst1q_u16(dst + 8, above_right);
+}
+
+void vpx_highbd_d45_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x8_t A0_0 = vld1q_u16(above);
+ const uint16x8_t A0_1 = vld1q_u16(above + 8);
+ const uint16x8_t A0_2 = vld1q_u16(above + 16);
+ const uint16x8_t A0_3 = vld1q_u16(above + 24);
+ const uint16x8_t above_right = vdupq_lane_u16(vget_high_u16(A0_3), 3);
+ const uint16x8_t A1_0 = vld1q_u16(above + 1);
+ const uint16x8_t A1_1 = vld1q_u16(above + 9);
+ const uint16x8_t A1_2 = vld1q_u16(above + 17);
+ const uint16x8_t A1_3 = vld1q_u16(above + 25);
+ const uint16x8_t A2_0 = vld1q_u16(above + 2);
+ const uint16x8_t A2_1 = vld1q_u16(above + 10);
+ const uint16x8_t A2_2 = vld1q_u16(above + 18);
+ const uint16x8_t A2_3 = vld1q_u16(above + 26);
+ const uint16x8_t avg_0 = vhaddq_u16(A0_0, A2_0);
+ const uint16x8_t avg_1 = vhaddq_u16(A0_1, A2_1);
+ const uint16x8_t avg_2 = vhaddq_u16(A0_2, A2_2);
+ const uint16x8_t avg_3 = vhaddq_u16(A0_3, A2_3);
+ uint16x8_t row_0 = vrhaddq_u16(avg_0, A1_0);
+ uint16x8_t row_1 = vrhaddq_u16(avg_1, A1_1);
+ uint16x8_t row_2 = vrhaddq_u16(avg_2, A1_2);
+ uint16x8_t row_3 = vrhaddq_u16(avg_3, A1_3);
+ int i;
+ (void)left;
+ (void)bd;
+
+ vst1q_u16(dst, row_0);
+ dst += 8;
+ vst1q_u16(dst, row_1);
+ dst += 8;
+ vst1q_u16(dst, row_2);
+ dst += 8;
+ vst1q_u16(dst, row_3);
+ dst += stride - 24;
+
+ for (i = 0; i < 30; ++i) {
+ row_0 = vextq_u16(row_0, row_1, 1);
+ row_1 = vextq_u16(row_1, row_2, 1);
+ row_2 = vextq_u16(row_2, row_3, 1);
+ row_3 = vextq_u16(row_3, above_right, 1);
+ vst1q_u16(dst, row_0);
+ dst += 8;
+ vst1q_u16(dst, row_1);
+ dst += 8;
+ vst1q_u16(dst, row_2);
+ dst += 8;
+ vst1q_u16(dst, row_3);
+ dst += stride - 24;
+ }
+
+ vst1q_u16(dst, above_right);
+ dst += 8;
+ vst1q_u16(dst, above_right);
+ dst += 8;
+ vst1q_u16(dst, above_right);
+ dst += 8;
+ vst1q_u16(dst, above_right);
+}
+
+// -----------------------------------------------------------------------------
+
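+// D135 prediction applies the same 3-tap filter to the edge formed by the
+// left column (bottom to top), the top-left pixel and the above row.  Each
+// output row is a window into that filtered edge, sliding one sample toward
+// the left column for every row down.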
+void vpx_highbd_d135_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x8_t XA0123___ = vld1q_u16(above - 1);
+ const uint16x4_t L0123 = vld1_u16(left);
+ const uint16x4_t L3210 = vrev64_u16(L0123);
+ const uint16x8_t L____3210 = vcombine_u16(L0123, L3210);
+ const uint16x8_t L3210XA012 = vcombine_u16(L3210, vget_low_u16(XA0123___));
+ const uint16x8_t L210XA0123 = vextq_u16(L____3210, XA0123___, 5);
+ const uint16x8_t L10XA0123_ = vextq_u16(L____3210, XA0123___, 6);
+ const uint16x8_t avg1 = vhaddq_u16(L3210XA012, L10XA0123_);
+ const uint16x8_t avg2 = vrhaddq_u16(avg1, L210XA0123);
+ const uint16x4_t row_0 = vget_low_u16(avg2);
+ const uint16x4_t row_1 = vget_high_u16(avg2);
+ const uint16x4_t r0 = vext_u16(row_0, row_1, 3);
+ const uint16x4_t r1 = vext_u16(row_0, row_1, 2);
+ const uint16x4_t r2 = vext_u16(row_0, row_1, 1);
+ (void)bd;
+ vst1_u16(dst, r0);
+ dst += stride;
+ vst1_u16(dst, r1);
+ dst += stride;
+ vst1_u16(dst, r2);
+ dst += stride;
+ vst1_u16(dst, row_0);
+}
+
+void vpx_highbd_d135_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x8_t XA0123456 = vld1q_u16(above - 1);
+ const uint16x8_t A01234567 = vld1q_u16(above);
+ const uint16x8_t A1234567_ = vld1q_u16(above + 1);
+ const uint16x8_t L01234567 = vld1q_u16(left);
+ const uint16x4_t L3210 = vrev64_u16(vget_low_u16(L01234567));
+ const uint16x4_t L7654 = vrev64_u16(vget_high_u16(L01234567));
+ const uint16x8_t L76543210 = vcombine_u16(L7654, L3210);
+ const uint16x8_t L6543210X = vextq_u16(L76543210, XA0123456, 1);
+ const uint16x8_t L543210XA0 = vextq_u16(L76543210, XA0123456, 2);
+ const uint16x8_t avg_0 = vhaddq_u16(L76543210, L543210XA0);
+ const uint16x8_t avg_1 = vhaddq_u16(XA0123456, A1234567_);
+ const uint16x8_t row_0 = vrhaddq_u16(avg_0, L6543210X);
+ const uint16x8_t row_1 = vrhaddq_u16(avg_1, A01234567);
+ const uint16x8_t r0 = vextq_u16(row_0, row_1, 7);
+ const uint16x8_t r1 = vextq_u16(row_0, row_1, 6);
+ const uint16x8_t r2 = vextq_u16(row_0, row_1, 5);
+ const uint16x8_t r3 = vextq_u16(row_0, row_1, 4);
+ const uint16x8_t r4 = vextq_u16(row_0, row_1, 3);
+ const uint16x8_t r5 = vextq_u16(row_0, row_1, 2);
+ const uint16x8_t r6 = vextq_u16(row_0, row_1, 1);
+ (void)bd;
+ vst1q_u16(dst, r0);
+ dst += stride;
+ vst1q_u16(dst, r1);
+ dst += stride;
+ vst1q_u16(dst, r2);
+ dst += stride;
+ vst1q_u16(dst, r3);
+ dst += stride;
+ vst1q_u16(dst, r4);
+ dst += stride;
+ vst1q_u16(dst, r5);
+ dst += stride;
+ vst1q_u16(dst, r6);
+ dst += stride;
+ vst1q_u16(dst, row_0);
+}
+
+static INLINE void d135_store_16(uint16_t **dst, const ptrdiff_t stride,
+ const uint16x8_t row_0,
+ const uint16x8_t row_1) {
+ vst1q_u16(*dst, row_0);
+ *dst += 8;
+ vst1q_u16(*dst, row_1);
+ *dst += stride - 8;
+}
+
+void vpx_highbd_d135_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x8_t L01234567 = vld1q_u16(left);
+ const uint16x8_t L89abcdef = vld1q_u16(left + 8);
+ const uint16x4_t L3210 = vrev64_u16(vget_low_u16(L01234567));
+ const uint16x4_t L7654 = vrev64_u16(vget_high_u16(L01234567));
+ const uint16x4_t Lba98 = vrev64_u16(vget_low_u16(L89abcdef));
+ const uint16x4_t Lfedc = vrev64_u16(vget_high_u16(L89abcdef));
+ const uint16x8_t L76543210 = vcombine_u16(L7654, L3210);
+ const uint16x8_t Lfedcba98 = vcombine_u16(Lfedc, Lba98);
+ const uint16x8_t Ledcba987 = vextq_u16(Lfedcba98, L76543210, 1);
+ const uint16x8_t Ldcba9876 = vextq_u16(Lfedcba98, L76543210, 2);
+ const uint16x8_t avg_0 = vhaddq_u16(Lfedcba98, Ldcba9876);
+ const uint16x8_t row_0 = vrhaddq_u16(avg_0, Ledcba987);
+
+ const uint16x8_t XA0123456 = vld1q_u16(above - 1);
+ const uint16x8_t L6543210X = vextq_u16(L76543210, XA0123456, 1);
+ const uint16x8_t L543210XA0 = vextq_u16(L76543210, XA0123456, 2);
+ const uint16x8_t avg_1 = vhaddq_u16(L76543210, L543210XA0);
+ const uint16x8_t row_1 = vrhaddq_u16(avg_1, L6543210X);
+
+ const uint16x8_t A01234567 = vld1q_u16(above);
+ const uint16x8_t A12345678 = vld1q_u16(above + 1);
+ const uint16x8_t avg_2 = vhaddq_u16(XA0123456, A12345678);
+ const uint16x8_t row_2 = vrhaddq_u16(avg_2, A01234567);
+
+ const uint16x8_t A789abcde = vld1q_u16(above + 7);
+ const uint16x8_t A89abcdef = vld1q_u16(above + 8);
+ const uint16x8_t A9abcdef_ = vld1q_u16(above + 9);
+ const uint16x8_t avg_3 = vhaddq_u16(A789abcde, A9abcdef_);
+ const uint16x8_t row_3 = vrhaddq_u16(avg_3, A89abcdef);
+
+ const uint16x8_t r0_0 = vextq_u16(row_1, row_2, 7);
+ const uint16x8_t r0_1 = vextq_u16(row_2, row_3, 7);
+ const uint16x8_t r1_0 = vextq_u16(row_1, row_2, 6);
+ const uint16x8_t r1_1 = vextq_u16(row_2, row_3, 6);
+ const uint16x8_t r2_0 = vextq_u16(row_1, row_2, 5);
+ const uint16x8_t r2_1 = vextq_u16(row_2, row_3, 5);
+ const uint16x8_t r3_0 = vextq_u16(row_1, row_2, 4);
+ const uint16x8_t r3_1 = vextq_u16(row_2, row_3, 4);
+ const uint16x8_t r4_0 = vextq_u16(row_1, row_2, 3);
+ const uint16x8_t r4_1 = vextq_u16(row_2, row_3, 3);
+ const uint16x8_t r5_0 = vextq_u16(row_1, row_2, 2);
+ const uint16x8_t r5_1 = vextq_u16(row_2, row_3, 2);
+ const uint16x8_t r6_0 = vextq_u16(row_1, row_2, 1);
+ const uint16x8_t r6_1 = vextq_u16(row_2, row_3, 1);
+ const uint16x8_t r8_0 = vextq_u16(row_0, row_1, 7);
+ const uint16x8_t r9_0 = vextq_u16(row_0, row_1, 6);
+ const uint16x8_t ra_0 = vextq_u16(row_0, row_1, 5);
+ const uint16x8_t rb_0 = vextq_u16(row_0, row_1, 4);
+ const uint16x8_t rc_0 = vextq_u16(row_0, row_1, 3);
+ const uint16x8_t rd_0 = vextq_u16(row_0, row_1, 2);
+ const uint16x8_t re_0 = vextq_u16(row_0, row_1, 1);
+ (void)bd;
+
+ d135_store_16(&dst, stride, r0_0, r0_1);
+ d135_store_16(&dst, stride, r1_0, r1_1);
+ d135_store_16(&dst, stride, r2_0, r2_1);
+ d135_store_16(&dst, stride, r3_0, r3_1);
+ d135_store_16(&dst, stride, r4_0, r4_1);
+ d135_store_16(&dst, stride, r5_0, r5_1);
+ d135_store_16(&dst, stride, r6_0, r6_1);
+ d135_store_16(&dst, stride, row_1, row_2);
+ d135_store_16(&dst, stride, r8_0, r0_0);
+ d135_store_16(&dst, stride, r9_0, r1_0);
+ d135_store_16(&dst, stride, ra_0, r2_0);
+ d135_store_16(&dst, stride, rb_0, r3_0);
+ d135_store_16(&dst, stride, rc_0, r4_0);
+ d135_store_16(&dst, stride, rd_0, r5_0);
+ d135_store_16(&dst, stride, re_0, r6_0);
+ vst1q_u16(dst, row_0);
+ dst += 8;
+ vst1q_u16(dst, row_1);
+}
+
+void vpx_highbd_d135_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x8_t LL01234567 = vld1q_u16(left + 16);
+ const uint16x8_t LL89abcdef = vld1q_u16(left + 24);
+ const uint16x4_t LL3210 = vrev64_u16(vget_low_u16(LL01234567));
+ const uint16x4_t LL7654 = vrev64_u16(vget_high_u16(LL01234567));
+ const uint16x4_t LLba98 = vrev64_u16(vget_low_u16(LL89abcdef));
+ const uint16x4_t LLfedc = vrev64_u16(vget_high_u16(LL89abcdef));
+ const uint16x8_t LL76543210 = vcombine_u16(LL7654, LL3210);
+ const uint16x8_t LLfedcba98 = vcombine_u16(LLfedc, LLba98);
+ const uint16x8_t LLedcba987 = vextq_u16(LLfedcba98, LL76543210, 1);
+ const uint16x8_t LLdcba9876 = vextq_u16(LLfedcba98, LL76543210, 2);
+ const uint16x8_t avg_0 = vhaddq_u16(LLfedcba98, LLdcba9876);
+ uint16x8_t row_0 = vrhaddq_u16(avg_0, LLedcba987);
+
+ const uint16x8_t LU01234567 = vld1q_u16(left);
+ const uint16x8_t LU89abcdef = vld1q_u16(left + 8);
+ const uint16x4_t LU3210 = vrev64_u16(vget_low_u16(LU01234567));
+ const uint16x4_t LU7654 = vrev64_u16(vget_high_u16(LU01234567));
+ const uint16x4_t LUba98 = vrev64_u16(vget_low_u16(LU89abcdef));
+ const uint16x4_t LUfedc = vrev64_u16(vget_high_u16(LU89abcdef));
+ const uint16x8_t LU76543210 = vcombine_u16(LU7654, LU3210);
+ const uint16x8_t LUfedcba98 = vcombine_u16(LUfedc, LUba98);
+ const uint16x8_t LL6543210Uf = vextq_u16(LL76543210, LUfedcba98, 1);
+ const uint16x8_t LL543210Ufe = vextq_u16(LL76543210, LUfedcba98, 2);
+ const uint16x8_t avg_1 = vhaddq_u16(LL76543210, LL543210Ufe);
+ uint16x8_t row_1 = vrhaddq_u16(avg_1, LL6543210Uf);
+
+ const uint16x8_t LUedcba987 = vextq_u16(LUfedcba98, LU76543210, 1);
+ const uint16x8_t LUdcba9876 = vextq_u16(LUfedcba98, LU76543210, 2);
+ const uint16x8_t avg_2 = vhaddq_u16(LUfedcba98, LUdcba9876);
+ uint16x8_t row_2 = vrhaddq_u16(avg_2, LUedcba987);
+
+ const uint16x8_t XAL0123456 = vld1q_u16(above - 1);
+ const uint16x8_t LU6543210X = vextq_u16(LU76543210, XAL0123456, 1);
+ const uint16x8_t LU543210XA0 = vextq_u16(LU76543210, XAL0123456, 2);
+ const uint16x8_t avg_3 = vhaddq_u16(LU76543210, LU543210XA0);
+ uint16x8_t row_3 = vrhaddq_u16(avg_3, LU6543210X);
+
+ const uint16x8_t AL01234567 = vld1q_u16(above);
+ const uint16x8_t AL12345678 = vld1q_u16(above + 1);
+ const uint16x8_t avg_4 = vhaddq_u16(XAL0123456, AL12345678);
+ uint16x8_t row_4 = vrhaddq_u16(avg_4, AL01234567);
+
+ const uint16x8_t AL789abcde = vld1q_u16(above + 7);
+ const uint16x8_t AL89abcdef = vld1q_u16(above + 8);
+ const uint16x8_t AL9abcdefg = vld1q_u16(above + 9);
+ const uint16x8_t avg_5 = vhaddq_u16(AL789abcde, AL9abcdefg);
+ uint16x8_t row_5 = vrhaddq_u16(avg_5, AL89abcdef);
+
+ const uint16x8_t ALfR0123456 = vld1q_u16(above + 15);
+ const uint16x8_t AR01234567 = vld1q_u16(above + 16);
+ const uint16x8_t AR12345678 = vld1q_u16(above + 17);
+ const uint16x8_t avg_6 = vhaddq_u16(ALfR0123456, AR12345678);
+ uint16x8_t row_6 = vrhaddq_u16(avg_6, AR01234567);
+
+ const uint16x8_t AR789abcde = vld1q_u16(above + 23);
+ const uint16x8_t AR89abcdef = vld1q_u16(above + 24);
+ const uint16x8_t AR9abcdef_ = vld1q_u16(above + 25);
+ const uint16x8_t avg_7 = vhaddq_u16(AR789abcde, AR9abcdef_);
+ uint16x8_t row_7 = vrhaddq_u16(avg_7, AR89abcdef);
+ int i, j;
+ (void)bd;
+
+ dst += 31 * stride;
+ for (i = 0; i < 4; ++i) {
+ for (j = 0; j < 8; ++j) {
+ vst1q_u16(dst, row_0);
+ dst += 8;
+ vst1q_u16(dst, row_1);
+ dst += 8;
+ vst1q_u16(dst, row_2);
+ dst += 8;
+ vst1q_u16(dst, row_3);
+ dst -= stride + 24;
+ row_0 = vextq_u16(row_0, row_1, 1);
+ row_1 = vextq_u16(row_1, row_2, 1);
+ row_2 = vextq_u16(row_2, row_3, 1);
+ row_3 = vextq_u16(row_3, row_4, 1);
+ row_4 = vextq_u16(row_4, row_4, 1);
+ }
+ row_4 = row_5;
+ row_5 = row_6;
+ row_6 = row_7;
+ }
+}
+
+//------------------------------------------------------------------------------
+
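+// Vertical prediction copies the above row into every row of the block; the
+// 16x16 and 32x32 variants use vld2q/vst2q purely as wide copies.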
+void vpx_highbd_v_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x4_t row = vld1_u16(above);
+ int i;
+ (void)left;
+ (void)bd;
+
+ for (i = 0; i < 4; i++, dst += stride) {
+ vst1_u16(dst, row);
+ }
+}
+
+void vpx_highbd_v_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x8_t row = vld1q_u16(above);
+ int i;
+ (void)left;
+ (void)bd;
+
+ for (i = 0; i < 8; i++, dst += stride) {
+ vst1q_u16(dst, row);
+ }
+}
+
+void vpx_highbd_v_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x8x2_t row = vld2q_u16(above);
+ int i;
+ (void)left;
+ (void)bd;
+
+ for (i = 0; i < 16; i++, dst += stride) {
+ vst2q_u16(dst, row);
+ }
+}
+
+void vpx_highbd_v_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x8x2_t row0 = vld2q_u16(above);
+ const uint16x8x2_t row1 = vld2q_u16(above + 16);
+ int i;
+ (void)left;
+ (void)bd;
+
+ for (i = 0; i < 32; i++) {
+ vst2q_u16(dst, row0);
+ dst += 16;
+ vst2q_u16(dst, row1);
+ dst += stride - 16;
+ }
+}
+
+// -----------------------------------------------------------------------------
+
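+// Horizontal prediction fills row r with left[r], broadcast with
+// vdup(q)_lane_u16.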
+void vpx_highbd_h_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x4_t left_u16 = vld1_u16(left);
+ uint16x4_t row;
+ (void)above;
+ (void)bd;
+
+ row = vdup_lane_u16(left_u16, 0);
+ vst1_u16(dst, row);
+ dst += stride;
+ row = vdup_lane_u16(left_u16, 1);
+ vst1_u16(dst, row);
+ dst += stride;
+ row = vdup_lane_u16(left_u16, 2);
+ vst1_u16(dst, row);
+ dst += stride;
+ row = vdup_lane_u16(left_u16, 3);
+ vst1_u16(dst, row);
+}
+
+void vpx_highbd_h_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x8_t left_u16 = vld1q_u16(left);
+ const uint16x4_t left_low = vget_low_u16(left_u16);
+ const uint16x4_t left_high = vget_high_u16(left_u16);
+ uint16x8_t row;
+ (void)above;
+ (void)bd;
+
+ row = vdupq_lane_u16(left_low, 0);
+ vst1q_u16(dst, row);
+ dst += stride;
+ row = vdupq_lane_u16(left_low, 1);
+ vst1q_u16(dst, row);
+ dst += stride;
+ row = vdupq_lane_u16(left_low, 2);
+ vst1q_u16(dst, row);
+ dst += stride;
+ row = vdupq_lane_u16(left_low, 3);
+ vst1q_u16(dst, row);
+ dst += stride;
+ row = vdupq_lane_u16(left_high, 0);
+ vst1q_u16(dst, row);
+ dst += stride;
+ row = vdupq_lane_u16(left_high, 1);
+ vst1q_u16(dst, row);
+ dst += stride;
+ row = vdupq_lane_u16(left_high, 2);
+ vst1q_u16(dst, row);
+ dst += stride;
+ row = vdupq_lane_u16(left_high, 3);
+ vst1q_u16(dst, row);
+}
+
+static INLINE void h_store_16(uint16_t **dst, const ptrdiff_t stride,
+ const uint16x8_t row) {
+ // Note: vst1q is faster than vst2q
+ vst1q_u16(*dst, row);
+ *dst += 8;
+ vst1q_u16(*dst, row);
+ *dst += stride - 8;
+}
+
+void vpx_highbd_h_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int i;
+ (void)above;
+ (void)bd;
+
+ for (i = 0; i < 2; i++, left += 8) {
+ const uint16x8_t left_u16q = vld1q_u16(left);
+ const uint16x4_t left_low = vget_low_u16(left_u16q);
+ const uint16x4_t left_high = vget_high_u16(left_u16q);
+ uint16x8_t row;
+
+ row = vdupq_lane_u16(left_low, 0);
+ h_store_16(&dst, stride, row);
+ row = vdupq_lane_u16(left_low, 1);
+ h_store_16(&dst, stride, row);
+ row = vdupq_lane_u16(left_low, 2);
+ h_store_16(&dst, stride, row);
+ row = vdupq_lane_u16(left_low, 3);
+ h_store_16(&dst, stride, row);
+ row = vdupq_lane_u16(left_high, 0);
+ h_store_16(&dst, stride, row);
+ row = vdupq_lane_u16(left_high, 1);
+ h_store_16(&dst, stride, row);
+ row = vdupq_lane_u16(left_high, 2);
+ h_store_16(&dst, stride, row);
+ row = vdupq_lane_u16(left_high, 3);
+ h_store_16(&dst, stride, row);
+ }
+}
+
+static INLINE void h_store_32(uint16_t **dst, const ptrdiff_t stride,
+ const uint16x8_t row) {
+ // Note: vst1q is faster than vst2q
+ vst1q_u16(*dst, row);
+ *dst += 8;
+ vst1q_u16(*dst, row);
+ *dst += 8;
+ vst1q_u16(*dst, row);
+ *dst += 8;
+ vst1q_u16(*dst, row);
+ *dst += stride - 24;
+}
+
+void vpx_highbd_h_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int i;
+ (void)above;
+ (void)bd;
+
+ for (i = 0; i < 4; i++, left += 8) {
+ const uint16x8_t left_u16q = vld1q_u16(left);
+ const uint16x4_t left_low = vget_low_u16(left_u16q);
+ const uint16x4_t left_high = vget_high_u16(left_u16q);
+ uint16x8_t row;
+
+ row = vdupq_lane_u16(left_low, 0);
+ h_store_32(&dst, stride, row);
+ row = vdupq_lane_u16(left_low, 1);
+ h_store_32(&dst, stride, row);
+ row = vdupq_lane_u16(left_low, 2);
+ h_store_32(&dst, stride, row);
+ row = vdupq_lane_u16(left_low, 3);
+ h_store_32(&dst, stride, row);
+ row = vdupq_lane_u16(left_high, 0);
+ h_store_32(&dst, stride, row);
+ row = vdupq_lane_u16(left_high, 1);
+ h_store_32(&dst, stride, row);
+ row = vdupq_lane_u16(left_high, 2);
+ h_store_32(&dst, stride, row);
+ row = vdupq_lane_u16(left_high, 3);
+ h_store_32(&dst, stride, row);
+ }
+}
+
+// -----------------------------------------------------------------------------
+
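+// TM (TrueMotion) prediction computes
+//   pred(r, c) = clip(left[r] + above[c] - top_left, 0, (1 << bd) - 1).
+// The above - top_left difference is computed once; each row adds the
+// broadcast left sample, clamps to the maximum with vminq_s16 and clamps
+// negative values to zero via the saturating vqshluq_n_s16(., 0).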
+void vpx_highbd_tm_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const int16x8_t max = vmovq_n_s16((1 << bd) - 1);
+ const int16x8_t top_left = vld1q_dup_s16((const int16_t *)(above - 1));
+ const int16x4_t above_s16d = vld1_s16((const int16_t *)above);
+ const int16x8_t above_s16 = vcombine_s16(above_s16d, above_s16d);
+ const int16x4_t left_s16 = vld1_s16((const int16_t *)left);
+ const int16x8_t sub = vsubq_s16(above_s16, top_left);
+ int16x8_t sum;
+ uint16x8_t row;
+
+ sum = vcombine_s16(vdup_lane_s16(left_s16, 0), vdup_lane_s16(left_s16, 1));
+ sum = vaddq_s16(sum, sub);
+ sum = vminq_s16(sum, max);
+ row = vqshluq_n_s16(sum, 0);
+ vst1_u16(dst, vget_low_u16(row));
+ dst += stride;
+ vst1_u16(dst, vget_high_u16(row));
+ dst += stride;
+
+ sum = vcombine_s16(vdup_lane_s16(left_s16, 2), vdup_lane_s16(left_s16, 3));
+ sum = vaddq_s16(sum, sub);
+ sum = vminq_s16(sum, max);
+ row = vqshluq_n_s16(sum, 0);
+ vst1_u16(dst, vget_low_u16(row));
+ dst += stride;
+ vst1_u16(dst, vget_high_u16(row));
+}
+
+static INLINE void tm_8_kernel(uint16_t **dst, const ptrdiff_t stride,
+ const int16x8_t left_dup, const int16x8_t sub,
+ const int16x8_t max) {
+ uint16x8_t row;
+ int16x8_t sum = vaddq_s16(left_dup, sub);
+ sum = vminq_s16(sum, max);
+ row = vqshluq_n_s16(sum, 0);
+ vst1q_u16(*dst, row);
+ *dst += stride;
+}
+
+void vpx_highbd_tm_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const int16x8_t max = vmovq_n_s16((1 << bd) - 1);
+ const int16x8_t top_left = vld1q_dup_s16((const int16_t *)(above - 1));
+ const int16x8_t above_s16 = vld1q_s16((const int16_t *)above);
+ const int16x8_t left_s16 = vld1q_s16((const int16_t *)left);
+ const int16x8_t sub = vsubq_s16(above_s16, top_left);
+ int16x4_t left_s16d;
+ int16x8_t left_dup;
+ int i;
+
+ left_s16d = vget_low_s16(left_s16);
+
+ for (i = 0; i < 2; i++, left_s16d = vget_high_s16(left_s16)) {
+ left_dup = vdupq_lane_s16(left_s16d, 0);
+ tm_8_kernel(&dst, stride, left_dup, sub, max);
+
+ left_dup = vdupq_lane_s16(left_s16d, 1);
+ tm_8_kernel(&dst, stride, left_dup, sub, max);
+
+ left_dup = vdupq_lane_s16(left_s16d, 2);
+ tm_8_kernel(&dst, stride, left_dup, sub, max);
+
+ left_dup = vdupq_lane_s16(left_s16d, 3);
+ tm_8_kernel(&dst, stride, left_dup, sub, max);
+ }
+}
+
+static INLINE void tm_16_kernel(uint16_t **dst, const ptrdiff_t stride,
+ const int16x8_t left_dup, const int16x8_t sub0,
+ const int16x8_t sub1, const int16x8_t max) {
+ uint16x8_t row0, row1;
+ int16x8_t sum0 = vaddq_s16(left_dup, sub0);
+ int16x8_t sum1 = vaddq_s16(left_dup, sub1);
+ sum0 = vminq_s16(sum0, max);
+ sum1 = vminq_s16(sum1, max);
+ row0 = vqshluq_n_s16(sum0, 0);
+ row1 = vqshluq_n_s16(sum1, 0);
+ vst1q_u16(*dst, row0);
+ *dst += 8;
+ vst1q_u16(*dst, row1);
+ *dst += stride - 8;
+}
+
+void vpx_highbd_tm_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const int16x8_t max = vmovq_n_s16((1 << bd) - 1);
+ const int16x8_t top_left = vld1q_dup_s16((const int16_t *)(above - 1));
+ const int16x8_t above0 = vld1q_s16((const int16_t *)above);
+ const int16x8_t above1 = vld1q_s16((const int16_t *)(above + 8));
+ const int16x8_t sub0 = vsubq_s16(above0, top_left);
+ const int16x8_t sub1 = vsubq_s16(above1, top_left);
+ int16x8_t left_dup;
+ int i, j;
+
+ for (j = 0; j < 2; j++, left += 8) {
+ const int16x8_t left_s16q = vld1q_s16((const int16_t *)left);
+ int16x4_t left_s16d = vget_low_s16(left_s16q);
+ for (i = 0; i < 2; i++, left_s16d = vget_high_s16(left_s16q)) {
+ left_dup = vdupq_lane_s16(left_s16d, 0);
+ tm_16_kernel(&dst, stride, left_dup, sub0, sub1, max);
+
+ left_dup = vdupq_lane_s16(left_s16d, 1);
+ tm_16_kernel(&dst, stride, left_dup, sub0, sub1, max);
+
+ left_dup = vdupq_lane_s16(left_s16d, 2);
+ tm_16_kernel(&dst, stride, left_dup, sub0, sub1, max);
+
+ left_dup = vdupq_lane_s16(left_s16d, 3);
+ tm_16_kernel(&dst, stride, left_dup, sub0, sub1, max);
+ }
+ }
+}
+
+static INLINE void tm_32_kernel(uint16_t **dst, const ptrdiff_t stride,
+ const int16x8_t left_dup, const int16x8_t sub0,
+ const int16x8_t sub1, const int16x8_t sub2,
+ const int16x8_t sub3, const int16x8_t max) {
+ uint16x8_t row0, row1, row2, row3;
+ int16x8_t sum0 = vaddq_s16(left_dup, sub0);
+ int16x8_t sum1 = vaddq_s16(left_dup, sub1);
+ int16x8_t sum2 = vaddq_s16(left_dup, sub2);
+ int16x8_t sum3 = vaddq_s16(left_dup, sub3);
+ sum0 = vminq_s16(sum0, max);
+ sum1 = vminq_s16(sum1, max);
+ sum2 = vminq_s16(sum2, max);
+ sum3 = vminq_s16(sum3, max);
+ row0 = vqshluq_n_s16(sum0, 0);
+ row1 = vqshluq_n_s16(sum1, 0);
+ row2 = vqshluq_n_s16(sum2, 0);
+ row3 = vqshluq_n_s16(sum3, 0);
+ vst1q_u16(*dst, row0);
+ *dst += 8;
+ vst1q_u16(*dst, row1);
+ *dst += 8;
+ vst1q_u16(*dst, row2);
+ *dst += 8;
+ vst1q_u16(*dst, row3);
+ *dst += stride - 24;
+}
+
+void vpx_highbd_tm_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const int16x8_t max = vmovq_n_s16((1 << bd) - 1);
+ const int16x8_t top_left = vld1q_dup_s16((const int16_t *)(above - 1));
+ const int16x8_t above0 = vld1q_s16((const int16_t *)above);
+ const int16x8_t above1 = vld1q_s16((const int16_t *)(above + 8));
+ const int16x8_t above2 = vld1q_s16((const int16_t *)(above + 16));
+ const int16x8_t above3 = vld1q_s16((const int16_t *)(above + 24));
+ const int16x8_t sub0 = vsubq_s16(above0, top_left);
+ const int16x8_t sub1 = vsubq_s16(above1, top_left);
+ const int16x8_t sub2 = vsubq_s16(above2, top_left);
+ const int16x8_t sub3 = vsubq_s16(above3, top_left);
+ int16x8_t left_dup;
+ int i, j;
+
+ for (i = 0; i < 4; i++, left += 8) {
+ const int16x8_t left_s16q = vld1q_s16((const int16_t *)left);
+ int16x4_t left_s16d = vget_low_s16(left_s16q);
+ for (j = 0; j < 2; j++, left_s16d = vget_high_s16(left_s16q)) {
+ left_dup = vdupq_lane_s16(left_s16d, 0);
+ tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3, max);
+
+ left_dup = vdupq_lane_s16(left_s16d, 1);
+ tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3, max);
+
+ left_dup = vdupq_lane_s16(left_s16d, 2);
+ tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3, max);
+
+ left_dup = vdupq_lane_s16(left_s16d, 3);
+ tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3, max);
+ }
+ }
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.asm
index e3c0c5210d2..d648840df40 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.asm
@@ -15,12 +15,11 @@
AREA ||.text||, CODE, READONLY, ALIGN=2
-;void vpx_idct16x16_1_add_neon(int16_t *input, uint8_t *dest,
-; int dest_stride)
+;void vpx_idct16x16_1_add_neon(int16_t *input, uint8_t *dest, int stride)
;
; r0 int16_t input
; r1 uint8_t *dest
-; r2 int dest_stride)
+; r2 int stride)
|vpx_idct16x16_1_add_neon| PROC
ldrsh r0, [r0]
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c
index f1e49ff5178..968bc5cc3ab 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c
@@ -11,49 +11,66 @@
#include <arm_neon.h>
#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/idct_neon.h"
#include "vpx_dsp/inv_txfm.h"
-#include "vpx_ports/mem.h"
-void vpx_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest,
- int dest_stride) {
- uint8x8_t d2u8, d3u8, d30u8, d31u8;
- uint64x1_t d2u64, d3u64, d4u64, d5u64;
- uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16;
- int16x8_t q0s16;
- uint8_t *d1, *d2;
- int16_t i, j, a1;
- int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
- out = dct_const_round_shift(out * cospi_16_64);
- a1 = ROUND_POWER_OF_TWO(out, 6);
-
- q0s16 = vdupq_n_s16(a1);
- q0u16 = vreinterpretq_u16_s16(q0s16);
-
- for (d1 = d2 = dest, i = 0; i < 4; i++) {
- for (j = 0; j < 2; j++) {
- d2u64 = vld1_u64((const uint64_t *)d1);
- d3u64 = vld1_u64((const uint64_t *)(d1 + 8));
- d1 += dest_stride;
- d4u64 = vld1_u64((const uint64_t *)d1);
- d5u64 = vld1_u64((const uint64_t *)(d1 + 8));
- d1 += dest_stride;
+static INLINE void idct16x16_1_add_pos_kernel(uint8_t **dest, const int stride,
+ const uint8x16_t res) {
+ const uint8x16_t a = vld1q_u8(*dest);
+ const uint8x16_t b = vqaddq_u8(a, res);
+ vst1q_u8(*dest, b);
+ *dest += stride;
+}
- q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64));
- q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64));
- q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64));
- q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64));
+static INLINE void idct16x16_1_add_neg_kernel(uint8_t **dest, const int stride,
+ const uint8x16_t res) {
+ const uint8x16_t a = vld1q_u8(*dest);
+ const uint8x16_t b = vqsubq_u8(a, res);
+ vst1q_u8(*dest, b);
+ *dest += stride;
+}
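+
+// A DC-only 16x16 inverse transform adds the same constant a1 to every pixel.
+// a1 may be negative, so the positive case uses a saturating uint8 add and the
+// negative case a saturating subtract of -a1.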
- d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
- d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
- d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
- d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
+void vpx_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ const int16_t out0 = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+ const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64));
+ const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6);
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
- vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d3u8));
- d2 += dest_stride;
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8));
- vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d31u8));
- d2 += dest_stride;
- }
+ if (a1 >= 0) {
+ const uint8x16_t dc = create_dcq(a1);
+ idct16x16_1_add_pos_kernel(&dest, stride, dc);
+ idct16x16_1_add_pos_kernel(&dest, stride, dc);
+ idct16x16_1_add_pos_kernel(&dest, stride, dc);
+ idct16x16_1_add_pos_kernel(&dest, stride, dc);
+ idct16x16_1_add_pos_kernel(&dest, stride, dc);
+ idct16x16_1_add_pos_kernel(&dest, stride, dc);
+ idct16x16_1_add_pos_kernel(&dest, stride, dc);
+ idct16x16_1_add_pos_kernel(&dest, stride, dc);
+ idct16x16_1_add_pos_kernel(&dest, stride, dc);
+ idct16x16_1_add_pos_kernel(&dest, stride, dc);
+ idct16x16_1_add_pos_kernel(&dest, stride, dc);
+ idct16x16_1_add_pos_kernel(&dest, stride, dc);
+ idct16x16_1_add_pos_kernel(&dest, stride, dc);
+ idct16x16_1_add_pos_kernel(&dest, stride, dc);
+ idct16x16_1_add_pos_kernel(&dest, stride, dc);
+ idct16x16_1_add_pos_kernel(&dest, stride, dc);
+ } else {
+ const uint8x16_t dc = create_dcq(-a1);
+ idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ idct16x16_1_add_neg_kernel(&dest, stride, dc);
}
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.asm
index 5e64cea0ae7..ea6b099d3bb 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.asm
@@ -8,8 +8,14 @@
; be found in the AUTHORS file in the root of the source tree.
;
+ INCLUDE vpx_dsp/arm/idct_neon.asm.S
+
EXPORT |vpx_idct16x16_256_add_neon_pass1|
EXPORT |vpx_idct16x16_256_add_neon_pass2|
+ IF CONFIG_VP9_HIGHBITDEPTH
+ EXPORT |vpx_idct16x16_256_add_neon_pass1_tran_low|
+ EXPORT |vpx_idct16x16_256_add_neon_pass2_tran_low|
+ ENDIF
EXPORT |vpx_idct16x16_10_add_neon_pass1|
EXPORT |vpx_idct16x16_10_add_neon_pass2|
ARM
@@ -36,12 +42,10 @@
MEND
AREA Block, CODE, READONLY ; name this block of code
-;void |vpx_idct16x16_256_add_neon_pass1|(int16_t *input,
-; int16_t *output, int output_stride)
+;void |vpx_idct16x16_256_add_neon_pass1|(const int16_t *input, int16_t *output)
;
-; r0 int16_t input
+; r0 const int16_t *input
; r1 int16_t *output
-; r2 int output_stride)
; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output
; will be stored back into q8-q15 registers. This function will touch q0-q7
@@ -60,6 +64,7 @@
vld2.s16 {q1,q2}, [r0]!
vmov.s16 q15, q1
+idct16x16_256_add_neon_pass1
; cospi_28_64 = 3196
movw r3, #0x0c7c
@@ -100,12 +105,12 @@
vdup.16 d3, r12 ; duplicate cospi_20_64
; dct_const_round_shift(temp1)
- vqrshrn.s32 d8, q2, #14 ; >> 14
- vqrshrn.s32 d9, q3, #14 ; >> 14
+ vrshrn.s32 d8, q2, #14 ; >> 14
+ vrshrn.s32 d9, q3, #14 ; >> 14
; dct_const_round_shift(temp2)
- vqrshrn.s32 d14, q5, #14 ; >> 14
- vqrshrn.s32 d15, q6, #14 ; >> 14
+ vrshrn.s32 d14, q5, #14 ; >> 14
+ vrshrn.s32 d15, q6, #14 ; >> 14
; preloading to avoid stall
; cospi_16_64 = 11585
@@ -131,12 +136,12 @@
vmlal.s16 q15, d23, d2
; dct_const_round_shift(temp1)
- vqrshrn.s32 d10, q2, #14 ; >> 14
- vqrshrn.s32 d11, q3, #14 ; >> 14
+ vrshrn.s32 d10, q2, #14 ; >> 14
+ vrshrn.s32 d11, q3, #14 ; >> 14
; dct_const_round_shift(temp2)
- vqrshrn.s32 d12, q9, #14 ; >> 14
- vqrshrn.s32 d13, q15, #14 ; >> 14
+ vrshrn.s32 d12, q9, #14 ; >> 14
+ vrshrn.s32 d13, q15, #14 ; >> 14
; stage 4
vdup.16 d30, r3 ; cospi_16_64
@@ -164,12 +169,12 @@
vsub.s32 q1, q11, q1
; dct_const_round_shift(temp1)
- vqrshrn.s32 d16, q3, #14 ; >> 14
- vqrshrn.s32 d17, q12, #14 ; >> 14
+ vrshrn.s32 d16, q3, #14 ; >> 14
+ vrshrn.s32 d17, q12, #14 ; >> 14
; dct_const_round_shift(temp2)
- vqrshrn.s32 d18, q13, #14 ; >> 14
- vqrshrn.s32 d19, q1, #14 ; >> 14
+ vrshrn.s32 d18, q13, #14 ; >> 14
+ vrshrn.s32 d19, q1, #14 ; >> 14
; step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
; step1[2] * cospi_8_64
@@ -189,12 +194,12 @@
vmlsl.s16 q13, d29, d31
; dct_const_round_shift(temp2)
- vqrshrn.s32 d22, q0, #14 ; >> 14
- vqrshrn.s32 d23, q1, #14 ; >> 14
+ vrshrn.s32 d22, q0, #14 ; >> 14
+ vrshrn.s32 d23, q1, #14 ; >> 14
; dct_const_round_shift(temp1)
- vqrshrn.s32 d20, q12, #14 ; >> 14
- vqrshrn.s32 d21, q13, #14 ; >> 14
+ vrshrn.s32 d20, q12, #14 ; >> 14
+ vrshrn.s32 d21, q13, #14 ; >> 14
vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5];
vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5];
@@ -229,15 +234,15 @@
vadd.s32 q10, q10, q12
; dct_const_round_shift(temp1)
- vqrshrn.s32 d10, q6, #14 ; >> 14
- vqrshrn.s32 d11, q13, #14 ; >> 14
+ vrshrn.s32 d10, q6, #14 ; >> 14
+ vrshrn.s32 d11, q13, #14 ; >> 14
; dct_const_round_shift(temp2)
- vqrshrn.s32 d12, q9, #14 ; >> 14
- vqrshrn.s32 d13, q10, #14 ; >> 14
+ vrshrn.s32 d12, q9, #14 ; >> 14
+ vrshrn.s32 d13, q10, #14 ; >> 14
; stage 6
- vadd.s16 q8, q0, q15 ; step2[0] = step1[0] + step1[7];
+ vadd.s16 q8, q0, q15 ; step2[0] = step1[0] + step1[7];
vadd.s16 q9, q1, q6 ; step2[1] = step1[1] + step1[6];
vadd.s16 q10, q2, q5 ; step2[2] = step1[2] + step1[5];
vadd.s16 q11, q3, q4 ; step2[3] = step1[3] + step1[4];
@@ -247,46 +252,54 @@
vsub.s16 q15, q0, q15 ; step2[7] = step1[0] - step1[7];
; store the data
- vst1.64 {d16}, [r1], r2
- vst1.64 {d17}, [r1], r2
- vst1.64 {d18}, [r1], r2
- vst1.64 {d19}, [r1], r2
- vst1.64 {d20}, [r1], r2
- vst1.64 {d21}, [r1], r2
- vst1.64 {d22}, [r1], r2
- vst1.64 {d23}, [r1], r2
- vst1.64 {d24}, [r1], r2
- vst1.64 {d25}, [r1], r2
- vst1.64 {d26}, [r1], r2
- vst1.64 {d27}, [r1], r2
- vst1.64 {d28}, [r1], r2
- vst1.64 {d29}, [r1], r2
- vst1.64 {d30}, [r1], r2
- vst1.64 {d31}, [r1], r2
+ vst1.64 {q8-q9}, [r1]!
+ vst1.64 {q10-q11}, [r1]!
+ vst1.64 {q12-q13}, [r1]!
+ vst1.64 {q14-q15}, [r1]
bx lr
ENDP ; |vpx_idct16x16_256_add_neon_pass1|
-;void vpx_idct16x16_256_add_neon_pass2(int16_t *src,
-; int16_t *output,
-; int16_t *pass1Output,
-; int16_t skip_adding,
-; uint8_t *dest,
-; int dest_stride)
+ IF CONFIG_VP9_HIGHBITDEPTH
+;void |vpx_idct16x16_256_add_neon_pass1_tran_low|(const tran_low_t *input,
+; int16_t *output)
+;
+; r0 const tran_low_t *input
+; r1 int16_t *output
+
+|vpx_idct16x16_256_add_neon_pass1_tran_low| PROC
+ LOAD_TRAN_LOW_TO_S16X2 d16, d17, d18, d19, r0
+ LOAD_TRAN_LOW_TO_S16X2 d18, d19, d20, d21, r0
+ LOAD_TRAN_LOW_TO_S16X2 d20, d21, d22, d23, r0
+ LOAD_TRAN_LOW_TO_S16X2 d22, d23, d24, d25, r0
+ LOAD_TRAN_LOW_TO_S16X2 d24, d25, d26, d27, r0
+ LOAD_TRAN_LOW_TO_S16X2 d26, d27, d28, d29, r0
+ LOAD_TRAN_LOW_TO_S16X2 d28, d29, d30, d31, r0
+ LOAD_TRAN_LOW_TO_S16X2 d2, d3, d4, d5, r0
+ vmov.s16 q15, q1
+
+ b idct16x16_256_add_neon_pass1
+ ENDP ; |vpx_idct16x16_256_add_neon_pass1_tran_low|
+ ENDIF ; CONFIG_VP9_HIGHBITDEPTH
+
+;void vpx_idct16x16_256_add_neon_pass2(const int16_t *src,
+; int16_t *output,
+; int16_t *pass1_output,
+; int16_t skip_adding,
+; uint8_t *dest,
+; int stride)
;
-; r0 int16_t *src
-; r1 int16_t *output,
-; r2 int16_t *pass1Output,
-; r3 int16_t skip_adding,
-; r4 uint8_t *dest,
-; r5 int dest_stride)
+; r0 const int16_t *src
+; r1 int16_t *output
+; r2 int16_t *pass1_output
+; r3 int16_t skip_adding
+; r4 uint8_t *dest
+; r5 int stride
; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output
; will be stored back into q8-q15 registers. This function will touch q0-q7
; registers and use them as buffer during calculation.
|vpx_idct16x16_256_add_neon_pass2| PROC
- push {r3-r9}
-
; TODO(hkuang): Find a better way to load the elements.
; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15
vld2.s16 {q8,q9}, [r0]!
@@ -299,6 +312,9 @@
vld2.s16 {q0,q1}, [r0]!
vmov.s16 q15, q0;
+idct16x16_256_add_neon_pass2
+ push {r3-r9}
+
; cospi_30_64 = 1606
movw r3, #0x0646
@@ -339,12 +355,12 @@
vdup.16 d31, r12 ; duplicate cospi_18_64
; dct_const_round_shift(temp1)
- vqrshrn.s32 d0, q2, #14 ; >> 14
- vqrshrn.s32 d1, q3, #14 ; >> 14
+ vrshrn.s32 d0, q2, #14 ; >> 14
+ vrshrn.s32 d1, q3, #14 ; >> 14
; dct_const_round_shift(temp2)
- vqrshrn.s32 d14, q1, #14 ; >> 14
- vqrshrn.s32 d15, q4, #14 ; >> 14
+ vrshrn.s32 d14, q1, #14 ; >> 14
+ vrshrn.s32 d15, q4, #14 ; >> 14
; preloading to avoid stall
; cospi_22_64 = 7723
@@ -373,12 +389,12 @@
vdup.16 d31, r12 ; duplicate cospi_10_64
; dct_const_round_shift(temp1)
- vqrshrn.s32 d2, q2, #14 ; >> 14
- vqrshrn.s32 d3, q3, #14 ; >> 14
+ vrshrn.s32 d2, q2, #14 ; >> 14
+ vrshrn.s32 d3, q3, #14 ; >> 14
; dct_const_round_shift(temp2)
- vqrshrn.s32 d12, q4, #14 ; >> 14
- vqrshrn.s32 d13, q5, #14 ; >> 14
+ vrshrn.s32 d12, q4, #14 ; >> 14
+ vrshrn.s32 d13, q5, #14 ; >> 14
; step1[10] * cospi_22_64
vmull.s16 q11, d20, d30
@@ -407,12 +423,12 @@
vdup.16 d31, r12 ; duplicate cospi_26_64
; dct_const_round_shift(temp1)
- vqrshrn.s32 d4, q11, #14 ; >> 14
- vqrshrn.s32 d5, q12, #14 ; >> 14
+ vrshrn.s32 d4, q11, #14 ; >> 14
+ vrshrn.s32 d5, q12, #14 ; >> 14
; dct_const_round_shift(temp2)
- vqrshrn.s32 d11, q5, #14 ; >> 14
- vqrshrn.s32 d10, q4, #14 ; >> 14
+ vrshrn.s32 d11, q5, #14 ; >> 14
+ vrshrn.s32 d10, q4, #14 ; >> 14
; step1[11] * cospi_6_64
vmull.s16 q10, d28, d30
@@ -434,12 +450,12 @@
vadd.s16 q0, q0, q1 ; step1[8]=step2[8]+step2[9]
; dct_const_round_shift(temp1)
- vqrshrn.s32 d6, q10, #14 ; >> 14
- vqrshrn.s32 d7, q11, #14 ; >> 14
+ vrshrn.s32 d6, q10, #14 ; >> 14
+ vrshrn.s32 d7, q11, #14 ; >> 14
; dct_const_round_shift(temp2)
- vqrshrn.s32 d8, q12, #14 ; >> 14
- vqrshrn.s32 d9, q13, #14 ; >> 14
+ vrshrn.s32 d8, q12, #14 ; >> 14
+ vrshrn.s32 d9, q13, #14 ; >> 14
; stage 3
vsub.s16 q10, q3, q2 ; step1[10]=-step2[10]+step2[11]
@@ -480,12 +496,12 @@
vdup.16 d30, r12 ; duplicate -cospi_8_64
; dct_const_round_shift(temp2)
- vqrshrn.s32 d12, q2, #14 ; >> 14
- vqrshrn.s32 d13, q3, #14 ; >> 14
+ vrshrn.s32 d12, q2, #14 ; >> 14
+ vrshrn.s32 d13, q3, #14 ; >> 14
; dct_const_round_shift(temp1)
- vqrshrn.s32 d2, q4, #14 ; >> 14
- vqrshrn.s32 d3, q5, #14 ; >> 14
+ vrshrn.s32 d2, q4, #14 ; >> 14
+ vrshrn.s32 d3, q5, #14 ; >> 14
vmov.s16 q3, q11
vmov.s16 q4, q12
@@ -507,12 +523,12 @@
vmlal.s16 q9, d27, d31
; dct_const_round_shift(temp2)
- vqrshrn.s32 d4, q11, #14 ; >> 14
- vqrshrn.s32 d5, q12, #14 ; >> 14
+ vrshrn.s32 d4, q11, #14 ; >> 14
+ vrshrn.s32 d5, q12, #14 ; >> 14
; dct_const_round_shift(temp1)
- vqrshrn.s32 d10, q8, #14 ; >> 14
- vqrshrn.s32 d11, q9, #14 ; >> 14
+ vrshrn.s32 d10, q8, #14 ; >> 14
+ vrshrn.s32 d11, q9, #14 ; >> 14
; stage 5
vadd.s16 q8, q0, q3 ; step1[8] = step2[8]+step2[11];
@@ -547,12 +563,12 @@
vadd.s32 q4, q4, q1
; dct_const_round_shift(temp1)
- vqrshrn.s32 d4, q5, #14 ; >> 14
- vqrshrn.s32 d5, q6, #14 ; >> 14
+ vrshrn.s32 d4, q5, #14 ; >> 14
+ vrshrn.s32 d5, q6, #14 ; >> 14
; dct_const_round_shift(temp2)
- vqrshrn.s32 d10, q10, #14 ; >> 14
- vqrshrn.s32 d11, q4, #14 ; >> 14
+ vrshrn.s32 d10, q10, #14 ; >> 14
+ vrshrn.s32 d11, q4, #14 ; >> 14
; step1[11] * cospi_16_64
vmull.s16 q0, d22, d14
@@ -571,21 +587,21 @@
vadd.s32 q6, q6, q1
; dct_const_round_shift(temp1)
- vqrshrn.s32 d6, q10, #14 ; >> 14
- vqrshrn.s32 d7, q4, #14 ; >> 14
+ vrshrn.s32 d6, q10, #14 ; >> 14
+ vrshrn.s32 d7, q4, #14 ; >> 14
; dct_const_round_shift(temp2)
- vqrshrn.s32 d8, q13, #14 ; >> 14
- vqrshrn.s32 d9, q6, #14 ; >> 14
+ vrshrn.s32 d8, q13, #14 ; >> 14
+ vrshrn.s32 d9, q6, #14 ; >> 14
- mov r4, #16 ; pass1Output stride
+ mov r4, #16 ; pass1_output stride
ldr r3, [sp] ; load skip_adding
cmp r3, #0 ; check if need adding dest data
beq skip_adding_dest
ldr r7, [sp, #28] ; dest used to save element 0-7
mov r9, r7 ; save dest pointer for later use
- ldr r8, [sp, #32] ; load dest_stride
+ ldr r8, [sp, #32] ; load stride
; stage 7
; load the data in pass1
@@ -599,8 +615,8 @@
vadd.s16 q13, q1, q14 ; step2[1] + step2[14]
vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO
vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO
- vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i]
- vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i]
+ vaddw.u8 q12, q12, d12 ; + dest[j * stride + i]
+ vaddw.u8 q13, q13, d13 ; + dest[j * stride + i]
vqmovun.s16 d12, q12 ; clip pixel
vqmovun.s16 d13, q13 ; clip pixel
vst1.64 {d12}, [r9], r8 ; store the data
@@ -613,8 +629,8 @@
vadd.s16 q13, q11, q4 ; step2[3] + step2[12]
vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO
vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO
- vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i]
- vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i]
+ vaddw.u8 q12, q12, d12 ; + dest[j * stride + i]
+ vaddw.u8 q13, q13, d13 ; + dest[j * stride + i]
vqmovun.s16 d12, q12 ; clip pixel
vqmovun.s16 d13, q13 ; clip pixel
vst1.64 {d12}, [r9], r8 ; store the data
@@ -631,8 +647,8 @@
vadd.s16 q13, q1, q2 ; step2[5] + step2[10]
vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO
vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO
- vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i]
- vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i]
+ vaddw.u8 q12, q12, d12 ; + dest[j * stride + i]
+ vaddw.u8 q13, q13, d13 ; + dest[j * stride + i]
vqmovun.s16 d12, q12 ; clip pixel
vqmovun.s16 d13, q13 ; clip pixel
vst1.64 {d12}, [r9], r8 ; store the data
@@ -645,8 +661,8 @@
vadd.s16 q13, q11, q8 ; step2[7] + step2[8]
vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO
vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO
- vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i]
- vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i]
+ vaddw.u8 q12, q12, d12 ; + dest[j * stride + i]
+ vaddw.u8 q13, q13, d13 ; + dest[j * stride + i]
vqmovun.s16 d12, q12 ; clip pixel
vqmovun.s16 d13, q13 ; clip pixel
vst1.64 {d12}, [r9], r8 ; store the data
@@ -658,42 +674,42 @@
; store the data output 8,9,10,11,12,13,14,15
vrshr.s16 q8, q8, #6 ; ROUND_POWER_OF_TWO
- vaddw.u8 q8, q8, d12 ; + dest[j * dest_stride + i]
+ vaddw.u8 q8, q8, d12 ; + dest[j * stride + i]
vqmovun.s16 d12, q8 ; clip pixel
vst1.64 {d12}, [r9], r8 ; store the data
vld1.64 {d12}, [r7], r8 ; load destination data
vrshr.s16 q9, q9, #6
- vaddw.u8 q9, q9, d13 ; + dest[j * dest_stride + i]
+ vaddw.u8 q9, q9, d13 ; + dest[j * stride + i]
vqmovun.s16 d13, q9 ; clip pixel
vst1.64 {d13}, [r9], r8 ; store the data
vld1.64 {d13}, [r7], r8 ; load destination data
vrshr.s16 q2, q2, #6
- vaddw.u8 q2, q2, d12 ; + dest[j * dest_stride + i]
+ vaddw.u8 q2, q2, d12 ; + dest[j * stride + i]
vqmovun.s16 d12, q2 ; clip pixel
vst1.64 {d12}, [r9], r8 ; store the data
vld1.64 {d12}, [r7], r8 ; load destination data
vrshr.s16 q3, q3, #6
- vaddw.u8 q3, q3, d13 ; + dest[j * dest_stride + i]
+ vaddw.u8 q3, q3, d13 ; + dest[j * stride + i]
vqmovun.s16 d13, q3 ; clip pixel
vst1.64 {d13}, [r9], r8 ; store the data
vld1.64 {d13}, [r7], r8 ; load destination data
vrshr.s16 q4, q4, #6
- vaddw.u8 q4, q4, d12 ; + dest[j * dest_stride + i]
+ vaddw.u8 q4, q4, d12 ; + dest[j * stride + i]
vqmovun.s16 d12, q4 ; clip pixel
vst1.64 {d12}, [r9], r8 ; store the data
vld1.64 {d12}, [r7], r8 ; load destination data
vrshr.s16 q5, q5, #6
- vaddw.u8 q5, q5, d13 ; + dest[j * dest_stride + i]
+ vaddw.u8 q5, q5, d13 ; + dest[j * stride + i]
vqmovun.s16 d13, q5 ; clip pixel
vst1.64 {d13}, [r9], r8 ; store the data
vld1.64 {d13}, [r7], r8 ; load destination data
vrshr.s16 q14, q14, #6
- vaddw.u8 q14, q14, d12 ; + dest[j * dest_stride + i]
+ vaddw.u8 q14, q14, d12 ; + dest[j * stride + i]
vqmovun.s16 d12, q14 ; clip pixel
vst1.64 {d12}, [r9], r8 ; store the data
vld1.64 {d12}, [r7], r8 ; load destination data
vrshr.s16 q15, q15, #6
- vaddw.u8 q15, q15, d13 ; + dest[j * dest_stride + i]
+ vaddw.u8 q15, q15, d13 ; + dest[j * stride + i]
vqmovun.s16 d13, q15 ; clip pixel
vst1.64 {d13}, [r9], r8 ; store the data
b end_idct16x16_pass2
@@ -767,12 +783,41 @@ end_idct16x16_pass2
bx lr
ENDP ; |vpx_idct16x16_256_add_neon_pass2|
-;void |vpx_idct16x16_10_add_neon_pass1|(int16_t *input,
-; int16_t *output, int output_stride)
+ IF CONFIG_VP9_HIGHBITDEPTH
+;void vpx_idct16x16_256_add_neon_pass2_tran_low(const tran_low_t *src,
+; int16_t *output,
+; int16_t *pass1_output,
+; int16_t skip_adding,
+; uint8_t *dest,
+; int stride)
;
-; r0 int16_t input
+; r0 const tran_low_t *src
+; r1 int16_t *output
+; r2 int16_t *pass1_output
+; r3 int16_t skip_adding
+; r4 uint8_t *dest
+; r5 int stride
+
+|vpx_idct16x16_256_add_neon_pass2_tran_low| PROC
+ LOAD_TRAN_LOW_TO_S16X2 d16, d17, d18, d19, r0
+ LOAD_TRAN_LOW_TO_S16X2 d18, d19, d20, d21, r0
+ LOAD_TRAN_LOW_TO_S16X2 d20, d21, d22, d23, r0
+ LOAD_TRAN_LOW_TO_S16X2 d22, d23, d24, d25, r0
+ LOAD_TRAN_LOW_TO_S16X2 d24, d25, d26, d27, r0
+ LOAD_TRAN_LOW_TO_S16X2 d26, d27, d28, d29, r0
+ LOAD_TRAN_LOW_TO_S16X2 d28, d29, d30, d31, r0
+ LOAD_TRAN_LOW_TO_S16X2 d0, d1, d2, d3, r0
+ vmov.s16 q15, q0
+
+ b idct16x16_256_add_neon_pass2
+ ENDP ; |vpx_idct16x16_256_add_neon_pass2_tran_low|
+ ENDIF ; CONFIG_VP9_HIGHBITDEPTH
+
+;void |vpx_idct16x16_10_add_neon_pass1|(const tran_low_t *input,
+; int16_t *output)
+;
+; r0 const tran_low_t *input
; r1 int16_t *output
-; r2 int output_stride)
; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output
; will be stored back into q8-q15 registers. This function will touch q0-q7
@@ -781,14 +826,14 @@ end_idct16x16_pass2
; TODO(hkuang): Find a better way to load the elements.
; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15
- vld2.s16 {q8,q9}, [r0]!
- vld2.s16 {q9,q10}, [r0]!
- vld2.s16 {q10,q11}, [r0]!
- vld2.s16 {q11,q12}, [r0]!
- vld2.s16 {q12,q13}, [r0]!
- vld2.s16 {q13,q14}, [r0]!
- vld2.s16 {q14,q15}, [r0]!
- vld2.s16 {q1,q2}, [r0]!
+ LOAD_TRAN_LOW_TO_S16X2 d16, d17, d18, d19, r0
+ LOAD_TRAN_LOW_TO_S16X2 d18, d19, d20, d21, r0
+ LOAD_TRAN_LOW_TO_S16X2 d20, d21, d22, d23, r0
+ LOAD_TRAN_LOW_TO_S16X2 d22, d23, d24, d25, r0
+ LOAD_TRAN_LOW_TO_S16X2 d24, d25, d26, d27, r0
+ LOAD_TRAN_LOW_TO_S16X2 d26, d27, d28, d29, r0
+ LOAD_TRAN_LOW_TO_S16X2 d28, d29, d30, d31, r0
+ LOAD_TRAN_LOW_TO_S16X2 d2, d3, d4, d5, r0
vmov.s16 q15, q1
; cospi_28_64*2 = 6392
@@ -846,12 +891,12 @@ end_idct16x16_pass2
vadd.s32 q10, q10, q12
; dct_const_round_shift(temp1)
- vqrshrn.s32 d11, q15, #14 ; >> 14
- vqrshrn.s32 d10, q6, #14 ; >> 14
+ vrshrn.s32 d11, q15, #14 ; >> 14
+ vrshrn.s32 d10, q6, #14 ; >> 14
; dct_const_round_shift(temp2)
- vqrshrn.s32 d12, q9, #14 ; >> 14
- vqrshrn.s32 d13, q10, #14 ; >> 14
+ vrshrn.s32 d12, q9, #14 ; >> 14
+ vrshrn.s32 d13, q10, #14 ; >> 14
; stage 6
vadd.s16 q2, q8, q7 ; step2[0] = step1[0] + step1[7];
@@ -864,39 +909,21 @@ end_idct16x16_pass2
vsub.s16 q15, q8, q7 ; step2[7] = step1[0] - step1[7];
; store the data
- vst1.64 {d4}, [r1], r2
- vst1.64 {d5}, [r1], r2
- vst1.64 {d18}, [r1], r2
- vst1.64 {d19}, [r1], r2
- vst1.64 {d20}, [r1], r2
- vst1.64 {d21}, [r1], r2
- vst1.64 {d22}, [r1], r2
- vst1.64 {d23}, [r1], r2
- vst1.64 {d24}, [r1], r2
- vst1.64 {d25}, [r1], r2
- vst1.64 {d26}, [r1], r2
- vst1.64 {d27}, [r1], r2
- vst1.64 {d28}, [r1], r2
- vst1.64 {d29}, [r1], r2
- vst1.64 {d30}, [r1], r2
- vst1.64 {d31}, [r1], r2
+ vst1.64 {q2}, [r1]!
+ vst1.64 {q9-q10}, [r1]!
+ vst1.64 {q11-q12}, [r1]!
+ vst1.64 {q13-q14}, [r1]!
+ vst1.64 {q15}, [r1]
bx lr
ENDP ; |vpx_idct16x16_10_add_neon_pass1|
-;void vpx_idct16x16_10_add_neon_pass2(int16_t *src,
-; int16_t *output,
-; int16_t *pass1Output,
-; int16_t skip_adding,
-; uint8_t *dest,
-; int dest_stride)
+;void vpx_idct16x16_10_add_neon_pass2(const tran_low_t *src, int16_t *output,
+; int16_t *pass1_output)
;
-; r0 int16_t *src
-; r1 int16_t *output,
-; r2 int16_t *pass1Output,
-; r3 int16_t skip_adding,
-; r4 uint8_t *dest,
-; r5 int dest_stride)
+; r0 const tran_low_t *src
+; r1 int16_t *output
+; r2 int16_t *pass1_output
; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output
; will be stored back into q8-q15 registers. This function will touch q0-q7
@@ -906,14 +933,14 @@ end_idct16x16_pass2
; TODO(hkuang): Find a better way to load the elements.
; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15
- vld2.s16 {q8,q9}, [r0]!
- vld2.s16 {q9,q10}, [r0]!
- vld2.s16 {q10,q11}, [r0]!
- vld2.s16 {q11,q12}, [r0]!
- vld2.s16 {q12,q13}, [r0]!
- vld2.s16 {q13,q14}, [r0]!
- vld2.s16 {q14,q15}, [r0]!
- vld2.s16 {q0,q1}, [r0]!
+ LOAD_TRAN_LOW_TO_S16X2 d16, d17, d18, d19, r0
+ LOAD_TRAN_LOW_TO_S16X2 d18, d19, d20, d21, r0
+ LOAD_TRAN_LOW_TO_S16X2 d20, d21, d22, d23, r0
+ LOAD_TRAN_LOW_TO_S16X2 d22, d23, d24, d25, r0
+ LOAD_TRAN_LOW_TO_S16X2 d24, d25, d26, d27, r0
+ LOAD_TRAN_LOW_TO_S16X2 d26, d27, d28, d29, r0
+ LOAD_TRAN_LOW_TO_S16X2 d28, d29, d30, d31, r0
+ LOAD_TRAN_LOW_TO_S16X2 d0, d1, d2, d3, r0
vmov.s16 q15, q0;
; 2*cospi_30_64 = 3212
@@ -981,12 +1008,12 @@ end_idct16x16_pass2
vdup.16 d30, r12 ; duplicate -cospi_8_64
; dct_const_round_shift(temp1)
- vqrshrn.s32 d2, q12, #14 ; >> 14
- vqrshrn.s32 d3, q5, #14 ; >> 14
+ vrshrn.s32 d2, q12, #14 ; >> 14
+ vrshrn.s32 d3, q5, #14 ; >> 14
; dct_const_round_shift(temp2)
- vqrshrn.s32 d12, q2, #14 ; >> 14
- vqrshrn.s32 d13, q11, #14 ; >> 14
+ vrshrn.s32 d12, q2, #14 ; >> 14
+ vrshrn.s32 d13, q11, #14 ; >> 14
; - step1[13] * cospi_8_64
vmull.s16 q10, d8, d30
@@ -1005,12 +1032,12 @@ end_idct16x16_pass2
vmlal.s16 q9, d9, d31
; dct_const_round_shift(temp1)
- vqrshrn.s32 d4, q10, #14 ; >> 14
- vqrshrn.s32 d5, q13, #14 ; >> 14
+ vrshrn.s32 d4, q10, #14 ; >> 14
+ vrshrn.s32 d5, q13, #14 ; >> 14
; dct_const_round_shift(temp2)
- vqrshrn.s32 d10, q8, #14 ; >> 14
- vqrshrn.s32 d11, q9, #14 ; >> 14
+ vrshrn.s32 d10, q8, #14 ; >> 14
+ vrshrn.s32 d11, q9, #14 ; >> 14
; stage 5
vadd.s16 q8, q0, q3 ; step1[8] = step2[8]+step2[11];
@@ -1045,12 +1072,12 @@ end_idct16x16_pass2
vadd.s32 q1, q4, q1
; dct_const_round_shift(temp1)
- vqrshrn.s32 d4, q5, #14 ; >> 14
- vqrshrn.s32 d5, q6, #14 ; >> 14
+ vrshrn.s32 d4, q5, #14 ; >> 14
+ vrshrn.s32 d5, q6, #14 ; >> 14
; dct_const_round_shift(temp2)
- vqrshrn.s32 d10, q0, #14 ; >> 14
- vqrshrn.s32 d11, q1, #14 ; >> 14
+ vrshrn.s32 d10, q0, #14 ; >> 14
+ vrshrn.s32 d11, q1, #14 ; >> 14
; step1[11] * cospi_16_64
vmull.s16 q0, d22, d14
@@ -1069,14 +1096,14 @@ end_idct16x16_pass2
vadd.s32 q6, q6, q1
; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d6, q10, #14 ; >> 14
- vqrshrn.s32 d7, q4, #14 ; >> 14
+ vrshrn.s32 d6, q10, #14 ; >> 14
+ vrshrn.s32 d7, q4, #14 ; >> 14
; dct_const_round_shift((step1[11] + step1[12]) * cospi_16_64);
- vqrshrn.s32 d8, q13, #14 ; >> 14
- vqrshrn.s32 d9, q6, #14 ; >> 14
+ vrshrn.s32 d8, q13, #14 ; >> 14
+ vrshrn.s32 d9, q6, #14 ; >> 14
- mov r4, #16 ; pass1Output stride
+ mov r4, #16 ; pass1_output stride
ldr r3, [sp] ; load skip_adding
; stage 7
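
The repeated vqrshrn.s32 -> vrshrn.s32 changes in the hunks above track the C reference: dct_const_round_shift() is a plain rounding shift by DCT_CONST_BITS with no saturation, so the non-saturating narrowing shift is the closer match. A minimal scalar sketch, assuming the usual ROUND_POWER_OF_TWO/DCT_CONST_BITS definitions from vpx_dsp/txfm_common.h; the _sketch name is illustrative only, not part of the patch:

#include <stdint.h>

#define DCT_CONST_BITS 14
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))

/* Plain rounding shift, no saturation -- which is why the non-saturating
 * vrshrn.s32 matches it more closely than the saturating vqrshrn.s32. */
static inline int32_t dct_const_round_shift_sketch(int64_t input) {
  return (int32_t)ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
}
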
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.c
index f682afc7bf6..0c891919b76 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.c
@@ -10,1218 +10,813 @@
#include <arm_neon.h>
-#include "./vpx_config.h"
-#include "vpx_dsp/arm/transpose_neon.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/idct_neon.h"
#include "vpx_dsp/txfm_common.h"
-void vpx_idct16x16_256_add_neon_pass1(int16_t *in, int16_t *out,
- int output_stride) {
- int16x4_t d0s16, d1s16, d2s16, d3s16;
- int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
- int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
- int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
- uint64x1_t d16u64, d17u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64;
- uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
- int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
- int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
- int32x4_t q0s32, q1s32, q2s32, q3s32, q5s32, q6s32, q9s32;
- int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;
- int16x8x2_t q0x2s16;
-
- q0x2s16 = vld2q_s16(in);
- q8s16 = q0x2s16.val[0];
- in += 16;
- q0x2s16 = vld2q_s16(in);
- q9s16 = q0x2s16.val[0];
- in += 16;
- q0x2s16 = vld2q_s16(in);
- q10s16 = q0x2s16.val[0];
- in += 16;
- q0x2s16 = vld2q_s16(in);
- q11s16 = q0x2s16.val[0];
- in += 16;
- q0x2s16 = vld2q_s16(in);
- q12s16 = q0x2s16.val[0];
- in += 16;
- q0x2s16 = vld2q_s16(in);
- q13s16 = q0x2s16.val[0];
- in += 16;
- q0x2s16 = vld2q_s16(in);
- q14s16 = q0x2s16.val[0];
- in += 16;
- q0x2s16 = vld2q_s16(in);
- q15s16 = q0x2s16.val[0];
-
- transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
- &q15s16);
-
- d16s16 = vget_low_s16(q8s16);
- d17s16 = vget_high_s16(q8s16);
- d18s16 = vget_low_s16(q9s16);
- d19s16 = vget_high_s16(q9s16);
- d20s16 = vget_low_s16(q10s16);
- d21s16 = vget_high_s16(q10s16);
- d22s16 = vget_low_s16(q11s16);
- d23s16 = vget_high_s16(q11s16);
- d24s16 = vget_low_s16(q12s16);
- d25s16 = vget_high_s16(q12s16);
- d26s16 = vget_low_s16(q13s16);
- d27s16 = vget_high_s16(q13s16);
- d28s16 = vget_low_s16(q14s16);
- d29s16 = vget_high_s16(q14s16);
- d30s16 = vget_low_s16(q15s16);
- d31s16 = vget_high_s16(q15s16);
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE void idct16x16_256_add_load_tran_low_kernel(
+ const tran_low_t **input, int16_t **out) {
+ int16x8_t s;
- // stage 3
- d0s16 = vdup_n_s16((int16_t)cospi_28_64);
- d1s16 = vdup_n_s16((int16_t)cospi_4_64);
-
- q2s32 = vmull_s16(d18s16, d0s16);
- q3s32 = vmull_s16(d19s16, d0s16);
- q5s32 = vmull_s16(d18s16, d1s16);
- q6s32 = vmull_s16(d19s16, d1s16);
-
- q2s32 = vmlsl_s16(q2s32, d30s16, d1s16);
- q3s32 = vmlsl_s16(q3s32, d31s16, d1s16);
- q5s32 = vmlal_s16(q5s32, d30s16, d0s16);
- q6s32 = vmlal_s16(q6s32, d31s16, d0s16);
-
- d2s16 = vdup_n_s16((int16_t)cospi_12_64);
- d3s16 = vdup_n_s16((int16_t)cospi_20_64);
-
- d8s16 = vqrshrn_n_s32(q2s32, 14);
- d9s16 = vqrshrn_n_s32(q3s32, 14);
- d14s16 = vqrshrn_n_s32(q5s32, 14);
- d15s16 = vqrshrn_n_s32(q6s32, 14);
- q4s16 = vcombine_s16(d8s16, d9s16);
- q7s16 = vcombine_s16(d14s16, d15s16);
-
- q2s32 = vmull_s16(d26s16, d2s16);
- q3s32 = vmull_s16(d27s16, d2s16);
- q9s32 = vmull_s16(d26s16, d3s16);
- q15s32 = vmull_s16(d27s16, d3s16);
-
- q2s32 = vmlsl_s16(q2s32, d22s16, d3s16);
- q3s32 = vmlsl_s16(q3s32, d23s16, d3s16);
- q9s32 = vmlal_s16(q9s32, d22s16, d2s16);
- q15s32 = vmlal_s16(q15s32, d23s16, d2s16);
-
- d10s16 = vqrshrn_n_s32(q2s32, 14);
- d11s16 = vqrshrn_n_s32(q3s32, 14);
- d12s16 = vqrshrn_n_s32(q9s32, 14);
- d13s16 = vqrshrn_n_s32(q15s32, 14);
- q5s16 = vcombine_s16(d10s16, d11s16);
- q6s16 = vcombine_s16(d12s16, d13s16);
+ s = load_tran_low_to_s16q(*input);
+ vst1q_s16(*out, s);
+ *input += 8;
+ *out += 8;
+}
- // stage 4
- d30s16 = vdup_n_s16((int16_t)cospi_16_64);
-
- q2s32 = vmull_s16(d16s16, d30s16);
- q11s32 = vmull_s16(d17s16, d30s16);
- q0s32 = vmull_s16(d24s16, d30s16);
- q1s32 = vmull_s16(d25s16, d30s16);
-
- d30s16 = vdup_n_s16((int16_t)cospi_24_64);
- d31s16 = vdup_n_s16((int16_t)cospi_8_64);
-
- q3s32 = vaddq_s32(q2s32, q0s32);
- q12s32 = vaddq_s32(q11s32, q1s32);
- q13s32 = vsubq_s32(q2s32, q0s32);
- q1s32 = vsubq_s32(q11s32, q1s32);
-
- d16s16 = vqrshrn_n_s32(q3s32, 14);
- d17s16 = vqrshrn_n_s32(q12s32, 14);
- d18s16 = vqrshrn_n_s32(q13s32, 14);
- d19s16 = vqrshrn_n_s32(q1s32, 14);
- q8s16 = vcombine_s16(d16s16, d17s16);
- q9s16 = vcombine_s16(d18s16, d19s16);
-
- q0s32 = vmull_s16(d20s16, d31s16);
- q1s32 = vmull_s16(d21s16, d31s16);
- q12s32 = vmull_s16(d20s16, d30s16);
- q13s32 = vmull_s16(d21s16, d30s16);
-
- q0s32 = vmlal_s16(q0s32, d28s16, d30s16);
- q1s32 = vmlal_s16(q1s32, d29s16, d30s16);
- q12s32 = vmlsl_s16(q12s32, d28s16, d31s16);
- q13s32 = vmlsl_s16(q13s32, d29s16, d31s16);
-
- d22s16 = vqrshrn_n_s32(q0s32, 14);
- d23s16 = vqrshrn_n_s32(q1s32, 14);
- d20s16 = vqrshrn_n_s32(q12s32, 14);
- d21s16 = vqrshrn_n_s32(q13s32, 14);
- q10s16 = vcombine_s16(d20s16, d21s16);
- q11s16 = vcombine_s16(d22s16, d23s16);
-
- q13s16 = vsubq_s16(q4s16, q5s16);
- q4s16 = vaddq_s16(q4s16, q5s16);
- q14s16 = vsubq_s16(q7s16, q6s16);
- q15s16 = vaddq_s16(q6s16, q7s16);
- d26s16 = vget_low_s16(q13s16);
- d27s16 = vget_high_s16(q13s16);
- d28s16 = vget_low_s16(q14s16);
- d29s16 = vget_high_s16(q14s16);
+static INLINE void idct16x16_256_add_load_tran_low(const tran_low_t *input,
+ int16_t *out) {
+ idct16x16_256_add_load_tran_low_kernel(&input, &out);
+ idct16x16_256_add_load_tran_low_kernel(&input, &out);
+ idct16x16_256_add_load_tran_low_kernel(&input, &out);
+ idct16x16_256_add_load_tran_low_kernel(&input, &out);
+ idct16x16_256_add_load_tran_low_kernel(&input, &out);
+ idct16x16_256_add_load_tran_low_kernel(&input, &out);
+ idct16x16_256_add_load_tran_low_kernel(&input, &out);
+ idct16x16_256_add_load_tran_low_kernel(&input, &out);
+ idct16x16_256_add_load_tran_low_kernel(&input, &out);
+ idct16x16_256_add_load_tran_low_kernel(&input, &out);
+ idct16x16_256_add_load_tran_low_kernel(&input, &out);
+ idct16x16_256_add_load_tran_low_kernel(&input, &out);
+ idct16x16_256_add_load_tran_low_kernel(&input, &out);
+ idct16x16_256_add_load_tran_low_kernel(&input, &out);
+ idct16x16_256_add_load_tran_low_kernel(&input, &out);
+ idct16x16_256_add_load_tran_low_kernel(&input, &out);
+ idct16x16_256_add_load_tran_low_kernel(&input, &out);
+ idct16x16_256_add_load_tran_low_kernel(&input, &out);
+ idct16x16_256_add_load_tran_low_kernel(&input, &out);
+ idct16x16_256_add_load_tran_low_kernel(&input, &out);
+ idct16x16_256_add_load_tran_low_kernel(&input, &out);
+ idct16x16_256_add_load_tran_low_kernel(&input, &out);
+ idct16x16_256_add_load_tran_low_kernel(&input, &out);
+ idct16x16_256_add_load_tran_low_kernel(&input, &out);
+ idct16x16_256_add_load_tran_low_kernel(&input, &out);
+ idct16x16_256_add_load_tran_low_kernel(&input, &out);
+ idct16x16_256_add_load_tran_low_kernel(&input, &out);
+ idct16x16_256_add_load_tran_low_kernel(&input, &out);
+ idct16x16_256_add_load_tran_low_kernel(&input, &out);
+ idct16x16_256_add_load_tran_low_kernel(&input, &out);
+ idct16x16_256_add_load_tran_low_kernel(&input, &out);
+ idct16x16_256_add_load_tran_low_kernel(&input, &out);
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
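
load_tran_low_to_s16q() comes from vpx_dsp/arm/idct_neon.h. A rough sketch of the narrowing it performs, assuming CONFIG_VP9_HIGHBITDEPTH makes tran_low_t a 32-bit type (an assumption about the typedef; the _sketch helper is illustrative, not part of the patch):

#include <arm_neon.h>
#include <stdint.h>

typedef int32_t tran_low_t; /* assumed high-bitdepth typedef */

/* Sketch only: load eight 32-bit coefficients and narrow them to int16 so
 * the 16-bit NEON idct can consume them. */
static inline int16x8_t load_tran_low_to_s16q_sketch(const tran_low_t *buf) {
  const int32x4_t v0 = vld1q_s32(buf);
  const int32x4_t v1 = vld1q_s32(buf + 4);
  return vcombine_s16(vmovn_s32(v0), vmovn_s32(v1));
}
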
- // stage 5
- q0s16 = vaddq_s16(q8s16, q11s16);
- q1s16 = vaddq_s16(q9s16, q10s16);
- q2s16 = vsubq_s16(q9s16, q10s16);
- q3s16 = vsubq_s16(q8s16, q11s16);
-
- d16s16 = vdup_n_s16((int16_t)cospi_16_64);
-
- q11s32 = vmull_s16(d26s16, d16s16);
- q12s32 = vmull_s16(d27s16, d16s16);
- q9s32 = vmull_s16(d28s16, d16s16);
- q10s32 = vmull_s16(d29s16, d16s16);
-
- q6s32 = vsubq_s32(q9s32, q11s32);
- q13s32 = vsubq_s32(q10s32, q12s32);
- q9s32 = vaddq_s32(q9s32, q11s32);
- q10s32 = vaddq_s32(q10s32, q12s32);
-
- d10s16 = vqrshrn_n_s32(q6s32, 14);
- d11s16 = vqrshrn_n_s32(q13s32, 14);
- d12s16 = vqrshrn_n_s32(q9s32, 14);
- d13s16 = vqrshrn_n_s32(q10s32, 14);
- q5s16 = vcombine_s16(d10s16, d11s16);
- q6s16 = vcombine_s16(d12s16, d13s16);
+static INLINE void wrap_low_4x2(const int32x4_t *const t32, int16x4_t *const d0,
+ int16x4_t *const d1) {
+ *d0 = vrshrn_n_s32(t32[0], 14);
+ *d1 = vrshrn_n_s32(t32[1], 14);
+}
- // stage 6
- q8s16 = vaddq_s16(q0s16, q15s16);
- q9s16 = vaddq_s16(q1s16, q6s16);
- q10s16 = vaddq_s16(q2s16, q5s16);
- q11s16 = vaddq_s16(q3s16, q4s16);
- q12s16 = vsubq_s16(q3s16, q4s16);
- q13s16 = vsubq_s16(q2s16, q5s16);
- q14s16 = vsubq_s16(q1s16, q6s16);
- q15s16 = vsubq_s16(q0s16, q15s16);
-
- d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16));
- d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16));
- d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
- d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
- d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16));
- d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16));
- d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16));
- d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16));
- d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
- d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
- d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
- d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
- d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
- d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
- d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
- d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));
-
- // store the data
- output_stride >>= 1; // output_stride / 2, out is int16_t
- vst1_u64((uint64_t *)out, d16u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d17u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d18u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d19u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d20u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d21u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d22u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d23u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d24u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d25u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d26u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d27u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d28u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d29u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d30u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d31u64);
+static INLINE void idct_cospi_2_30(const int16x8_t s0, const int16x8_t s1,
+ const int16x4_t cospi_2_30_10_22,
+ int16x8_t *const d0, int16x8_t *const d1) {
+ int32x4_t t32[6];
+
+ t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_2_30_10_22, 1);
+ t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_2_30_10_22, 1);
+ t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_2_30_10_22, 1);
+ t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_2_30_10_22, 1);
+ t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_2_30_10_22, 0);
+ t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_2_30_10_22, 0);
+ t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_2_30_10_22, 0);
+ t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_2_30_10_22, 0);
+ idct16x16_add_wrap_low_8x2(t32, d0, d1);
}
-void vpx_idct16x16_256_add_neon_pass2(int16_t *src, int16_t *out,
- int16_t *pass1Output, int16_t skip_adding,
- uint8_t *dest, int dest_stride) {
- uint8_t *d;
- uint8x8_t d12u8, d13u8;
- int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
- int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
- int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
- int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
- uint64x1_t d24u64, d25u64, d26u64, d27u64;
- int64x1_t d12s64, d13s64;
- uint16x8_t q2u16, q3u16, q4u16, q5u16, q8u16;
- uint16x8_t q9u16, q12u16, q13u16, q14u16, q15u16;
- int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
- int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
- int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32;
- int32x4_t q10s32, q11s32, q12s32, q13s32;
- int16x8x2_t q0x2s16;
-
- q0x2s16 = vld2q_s16(src);
- q8s16 = q0x2s16.val[0];
- src += 16;
- q0x2s16 = vld2q_s16(src);
- q9s16 = q0x2s16.val[0];
- src += 16;
- q0x2s16 = vld2q_s16(src);
- q10s16 = q0x2s16.val[0];
- src += 16;
- q0x2s16 = vld2q_s16(src);
- q11s16 = q0x2s16.val[0];
- src += 16;
- q0x2s16 = vld2q_s16(src);
- q12s16 = q0x2s16.val[0];
- src += 16;
- q0x2s16 = vld2q_s16(src);
- q13s16 = q0x2s16.val[0];
- src += 16;
- q0x2s16 = vld2q_s16(src);
- q14s16 = q0x2s16.val[0];
- src += 16;
- q0x2s16 = vld2q_s16(src);
- q15s16 = q0x2s16.val[0];
-
- transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
- &q15s16);
-
- d16s16 = vget_low_s16(q8s16);
- d17s16 = vget_high_s16(q8s16);
- d18s16 = vget_low_s16(q9s16);
- d19s16 = vget_high_s16(q9s16);
- d20s16 = vget_low_s16(q10s16);
- d21s16 = vget_high_s16(q10s16);
- d22s16 = vget_low_s16(q11s16);
- d23s16 = vget_high_s16(q11s16);
- d24s16 = vget_low_s16(q12s16);
- d25s16 = vget_high_s16(q12s16);
- d26s16 = vget_low_s16(q13s16);
- d27s16 = vget_high_s16(q13s16);
- d28s16 = vget_low_s16(q14s16);
- d29s16 = vget_high_s16(q14s16);
- d30s16 = vget_low_s16(q15s16);
- d31s16 = vget_high_s16(q15s16);
+static INLINE void idct_cospi_4_28(const int16x8_t s0, const int16x8_t s1,
+ const int16x4_t cospi_4_12_20N_28,
+ int16x8_t *const d0, int16x8_t *const d1) {
+ int32x4_t t32[6];
+
+ t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_4_12_20N_28, 3);
+ t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_4_12_20N_28, 3);
+ t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_4_12_20N_28, 3);
+ t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_4_12_20N_28, 3);
+ t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_4_12_20N_28, 0);
+ t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_4_12_20N_28, 0);
+ t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_4_12_20N_28, 0);
+ t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_4_12_20N_28, 0);
+ idct16x16_add_wrap_low_8x2(t32, d0, d1);
+}
- // stage 3
- d12s16 = vdup_n_s16((int16_t)cospi_30_64);
- d13s16 = vdup_n_s16((int16_t)cospi_2_64);
-
- q2s32 = vmull_s16(d16s16, d12s16);
- q3s32 = vmull_s16(d17s16, d12s16);
- q1s32 = vmull_s16(d16s16, d13s16);
- q4s32 = vmull_s16(d17s16, d13s16);
-
- q2s32 = vmlsl_s16(q2s32, d30s16, d13s16);
- q3s32 = vmlsl_s16(q3s32, d31s16, d13s16);
- q1s32 = vmlal_s16(q1s32, d30s16, d12s16);
- q4s32 = vmlal_s16(q4s32, d31s16, d12s16);
-
- d0s16 = vqrshrn_n_s32(q2s32, 14);
- d1s16 = vqrshrn_n_s32(q3s32, 14);
- d14s16 = vqrshrn_n_s32(q1s32, 14);
- d15s16 = vqrshrn_n_s32(q4s32, 14);
- q0s16 = vcombine_s16(d0s16, d1s16);
- q7s16 = vcombine_s16(d14s16, d15s16);
-
- d30s16 = vdup_n_s16((int16_t)cospi_14_64);
- d31s16 = vdup_n_s16((int16_t)cospi_18_64);
-
- q2s32 = vmull_s16(d24s16, d30s16);
- q3s32 = vmull_s16(d25s16, d30s16);
- q4s32 = vmull_s16(d24s16, d31s16);
- q5s32 = vmull_s16(d25s16, d31s16);
-
- q2s32 = vmlsl_s16(q2s32, d22s16, d31s16);
- q3s32 = vmlsl_s16(q3s32, d23s16, d31s16);
- q4s32 = vmlal_s16(q4s32, d22s16, d30s16);
- q5s32 = vmlal_s16(q5s32, d23s16, d30s16);
-
- d2s16 = vqrshrn_n_s32(q2s32, 14);
- d3s16 = vqrshrn_n_s32(q3s32, 14);
- d12s16 = vqrshrn_n_s32(q4s32, 14);
- d13s16 = vqrshrn_n_s32(q5s32, 14);
- q1s16 = vcombine_s16(d2s16, d3s16);
- q6s16 = vcombine_s16(d12s16, d13s16);
-
- d30s16 = vdup_n_s16((int16_t)cospi_22_64);
- d31s16 = vdup_n_s16((int16_t)cospi_10_64);
-
- q11s32 = vmull_s16(d20s16, d30s16);
- q12s32 = vmull_s16(d21s16, d30s16);
- q4s32 = vmull_s16(d20s16, d31s16);
- q5s32 = vmull_s16(d21s16, d31s16);
-
- q11s32 = vmlsl_s16(q11s32, d26s16, d31s16);
- q12s32 = vmlsl_s16(q12s32, d27s16, d31s16);
- q4s32 = vmlal_s16(q4s32, d26s16, d30s16);
- q5s32 = vmlal_s16(q5s32, d27s16, d30s16);
-
- d4s16 = vqrshrn_n_s32(q11s32, 14);
- d5s16 = vqrshrn_n_s32(q12s32, 14);
- d11s16 = vqrshrn_n_s32(q5s32, 14);
- d10s16 = vqrshrn_n_s32(q4s32, 14);
- q2s16 = vcombine_s16(d4s16, d5s16);
- q5s16 = vcombine_s16(d10s16, d11s16);
-
- d30s16 = vdup_n_s16((int16_t)cospi_6_64);
- d31s16 = vdup_n_s16((int16_t)cospi_26_64);
-
- q10s32 = vmull_s16(d28s16, d30s16);
- q11s32 = vmull_s16(d29s16, d30s16);
- q12s32 = vmull_s16(d28s16, d31s16);
- q13s32 = vmull_s16(d29s16, d31s16);
-
- q10s32 = vmlsl_s16(q10s32, d18s16, d31s16);
- q11s32 = vmlsl_s16(q11s32, d19s16, d31s16);
- q12s32 = vmlal_s16(q12s32, d18s16, d30s16);
- q13s32 = vmlal_s16(q13s32, d19s16, d30s16);
-
- d6s16 = vqrshrn_n_s32(q10s32, 14);
- d7s16 = vqrshrn_n_s32(q11s32, 14);
- d8s16 = vqrshrn_n_s32(q12s32, 14);
- d9s16 = vqrshrn_n_s32(q13s32, 14);
- q3s16 = vcombine_s16(d6s16, d7s16);
- q4s16 = vcombine_s16(d8s16, d9s16);
+static INLINE void idct_cospi_6_26(const int16x8_t s0, const int16x8_t s1,
+ const int16x4_t cospi_6_26_14_18N,
+ int16x8_t *const d0, int16x8_t *const d1) {
+ int32x4_t t32[6];
+
+ t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_6_26_14_18N, 0);
+ t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_6_26_14_18N, 0);
+ t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_6_26_14_18N, 0);
+ t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_6_26_14_18N, 0);
+ t32[0] = vmlal_lane_s16(t32[0], vget_low_s16(s1), cospi_6_26_14_18N, 1);
+ t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_6_26_14_18N, 1);
+ t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_6_26_14_18N, 1);
+ t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_6_26_14_18N, 1);
+ idct16x16_add_wrap_low_8x2(t32, d0, d1);
+}
+
+static INLINE void idct_cospi_8_24_d_kernel(const int16x4_t s0,
+ const int16x4_t s1,
+ const int16x4_t cospi_0_8_16_24,
+ int32x4_t *const t32) {
+ t32[0] = vmull_lane_s16(s0, cospi_0_8_16_24, 3);
+ t32[1] = vmull_lane_s16(s1, cospi_0_8_16_24, 3);
+ t32[0] = vmlsl_lane_s16(t32[0], s1, cospi_0_8_16_24, 1);
+ t32[1] = vmlal_lane_s16(t32[1], s0, cospi_0_8_16_24, 1);
+}
+
+static INLINE void idct_cospi_8_24_d(const int16x4_t s0, const int16x4_t s1,
+ const int16x4_t cospi_0_8_16_24,
+ int16x4_t *const d0, int16x4_t *const d1) {
+ int32x4_t t32[2];
+
+ idct_cospi_8_24_d_kernel(s0, s1, cospi_0_8_16_24, t32);
+ wrap_low_4x2(t32, d0, d1);
+}
+
+static INLINE void idct_cospi_8_24_neg_d(const int16x4_t s0, const int16x4_t s1,
+ const int16x4_t cospi_0_8_16_24,
+ int16x4_t *const d0,
+ int16x4_t *const d1) {
+ int32x4_t t32[2];
+
+ idct_cospi_8_24_d_kernel(s0, s1, cospi_0_8_16_24, t32);
+ t32[1] = vnegq_s32(t32[1]);
+ wrap_low_4x2(t32, d0, d1);
+}
+
+static INLINE void idct_cospi_10_22(const int16x8_t s0, const int16x8_t s1,
+ const int16x4_t cospi_2_30_10_22,
+ int16x8_t *const d0, int16x8_t *const d1) {
+ int32x4_t t32[6];
+
+ t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_2_30_10_22, 3);
+ t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_2_30_10_22, 3);
+ t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_2_30_10_22, 3);
+ t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_2_30_10_22, 3);
+ t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_2_30_10_22, 2);
+ t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_2_30_10_22, 2);
+ t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_2_30_10_22, 2);
+ t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_2_30_10_22, 2);
+ idct16x16_add_wrap_low_8x2(t32, d0, d1);
+}
+
+static INLINE void idct_cospi_12_20(const int16x8_t s0, const int16x8_t s1,
+ const int16x4_t cospi_4_12_20N_28,
+ int16x8_t *const d0, int16x8_t *const d1) {
+ int32x4_t t32[6];
+
+ t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_4_12_20N_28, 1);
+ t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_4_12_20N_28, 1);
+ t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_4_12_20N_28, 1);
+ t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_4_12_20N_28, 1);
+ t32[0] = vmlal_lane_s16(t32[0], vget_low_s16(s1), cospi_4_12_20N_28, 2);
+ t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_4_12_20N_28, 2);
+ t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_4_12_20N_28, 2);
+ t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_4_12_20N_28, 2);
+ idct16x16_add_wrap_low_8x2(t32, d0, d1);
+}
+
+static INLINE void idct_cospi_14_18(const int16x8_t s0, const int16x8_t s1,
+ const int16x4_t cospi_6_26_14_18N,
+ int16x8_t *const d0, int16x8_t *const d1) {
+ int32x4_t t32[6];
+
+ t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_6_26_14_18N, 2);
+ t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_6_26_14_18N, 2);
+ t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_6_26_14_18N, 2);
+ t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_6_26_14_18N, 2);
+ t32[0] = vmlal_lane_s16(t32[0], vget_low_s16(s1), cospi_6_26_14_18N, 3);
+ t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_6_26_14_18N, 3);
+ t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_6_26_14_18N, 3);
+ t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_6_26_14_18N, 3);
+ idct16x16_add_wrap_low_8x2(t32, d0, d1);
+}
+
+static INLINE void idct_cospi_16_16_d(const int16x4_t s0, const int16x4_t s1,
+ const int16x4_t cospi_0_8_16_24,
+ int16x4_t *const d0,
+ int16x4_t *const d1) {
+ int32x4_t t32[3];
+
+ t32[2] = vmull_lane_s16(s1, cospi_0_8_16_24, 2);
+ t32[0] = vmlsl_lane_s16(t32[2], s0, cospi_0_8_16_24, 2);
+ t32[1] = vmlal_lane_s16(t32[2], s0, cospi_0_8_16_24, 2);
+ wrap_low_4x2(t32, d0, d1);
+}
+
+static void idct16x16_256_add_half1d(const int16_t *input, int16_t *output,
+ uint8_t *dest, int stride) {
+ const int16x8_t cospis0 = vld1q_s16(kCospi);
+ const int16x8_t cospis1 = vld1q_s16(kCospi + 8);
+ const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0);
+ const int16x4_t cospi_4_12_20N_28 = vget_high_s16(cospis0);
+ const int16x4_t cospi_2_30_10_22 = vget_low_s16(cospis1);
+ const int16x4_t cospi_6_26_14_18N = vget_high_s16(cospis1);
+ int16x8_t in[16], step1[16], step2[16], out[16];
+
+ // Load input (16x8)
+ in[0] = vld1q_s16(input);
+ input += 8;
+ in[8] = vld1q_s16(input);
+ input += 8;
+ in[1] = vld1q_s16(input);
+ input += 8;
+ in[9] = vld1q_s16(input);
+ input += 8;
+ in[2] = vld1q_s16(input);
+ input += 8;
+ in[10] = vld1q_s16(input);
+ input += 8;
+ in[3] = vld1q_s16(input);
+ input += 8;
+ in[11] = vld1q_s16(input);
+ input += 8;
+ in[4] = vld1q_s16(input);
+ input += 8;
+ in[12] = vld1q_s16(input);
+ input += 8;
+ in[5] = vld1q_s16(input);
+ input += 8;
+ in[13] = vld1q_s16(input);
+ input += 8;
+ in[6] = vld1q_s16(input);
+ input += 8;
+ in[14] = vld1q_s16(input);
+ input += 8;
+ in[7] = vld1q_s16(input);
+ input += 8;
+ in[15] = vld1q_s16(input);
+
+ // Transpose
+ transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
+ &in[7]);
+ transpose_s16_8x8(&in[8], &in[9], &in[10], &in[11], &in[12], &in[13], &in[14],
+ &in[15]);
+
+ // stage 1
+ step1[0] = in[0 / 2];
+ step1[1] = in[16 / 2];
+ step1[2] = in[8 / 2];
+ step1[3] = in[24 / 2];
+ step1[4] = in[4 / 2];
+ step1[5] = in[20 / 2];
+ step1[6] = in[12 / 2];
+ step1[7] = in[28 / 2];
+ step1[8] = in[2 / 2];
+ step1[9] = in[18 / 2];
+ step1[10] = in[10 / 2];
+ step1[11] = in[26 / 2];
+ step1[12] = in[6 / 2];
+ step1[13] = in[22 / 2];
+ step1[14] = in[14 / 2];
+ step1[15] = in[30 / 2];
+
+ // stage 2
+ step2[0] = step1[0];
+ step2[1] = step1[1];
+ step2[2] = step1[2];
+ step2[3] = step1[3];
+ step2[4] = step1[4];
+ step2[5] = step1[5];
+ step2[6] = step1[6];
+ step2[7] = step1[7];
+ idct_cospi_2_30(step1[8], step1[15], cospi_2_30_10_22, &step2[8], &step2[15]);
+ idct_cospi_14_18(step1[9], step1[14], cospi_6_26_14_18N, &step2[9],
+ &step2[14]);
+ idct_cospi_10_22(step1[10], step1[13], cospi_2_30_10_22, &step2[10],
+ &step2[13]);
+ idct_cospi_6_26(step1[11], step1[12], cospi_6_26_14_18N, &step2[11],
+ &step2[12]);
// stage 3
- q9s16 = vsubq_s16(q0s16, q1s16);
- q0s16 = vaddq_s16(q0s16, q1s16);
- q10s16 = vsubq_s16(q3s16, q2s16);
- q11s16 = vaddq_s16(q2s16, q3s16);
- q12s16 = vaddq_s16(q4s16, q5s16);
- q13s16 = vsubq_s16(q4s16, q5s16);
- q14s16 = vsubq_s16(q7s16, q6s16);
- q7s16 = vaddq_s16(q6s16, q7s16);
+ step1[0] = step2[0];
+ step1[1] = step2[1];
+ step1[2] = step2[2];
+ step1[3] = step2[3];
+ idct_cospi_4_28(step2[4], step2[7], cospi_4_12_20N_28, &step1[4], &step1[7]);
+ idct_cospi_12_20(step2[5], step2[6], cospi_4_12_20N_28, &step1[5], &step1[6]);
+ step1[8] = vaddq_s16(step2[8], step2[9]);
+ step1[9] = vsubq_s16(step2[8], step2[9]);
+ step1[10] = vsubq_s16(step2[11], step2[10]);
+ step1[11] = vaddq_s16(step2[11], step2[10]);
+ step1[12] = vaddq_s16(step2[12], step2[13]);
+ step1[13] = vsubq_s16(step2[12], step2[13]);
+ step1[14] = vsubq_s16(step2[15], step2[14]);
+ step1[15] = vaddq_s16(step2[15], step2[14]);
// stage 4
- d18s16 = vget_low_s16(q9s16);
- d19s16 = vget_high_s16(q9s16);
- d20s16 = vget_low_s16(q10s16);
- d21s16 = vget_high_s16(q10s16);
- d26s16 = vget_low_s16(q13s16);
- d27s16 = vget_high_s16(q13s16);
- d28s16 = vget_low_s16(q14s16);
- d29s16 = vget_high_s16(q14s16);
-
- d30s16 = vdup_n_s16((int16_t)cospi_8_64);
- d31s16 = vdup_n_s16((int16_t)cospi_24_64);
-
- q2s32 = vmull_s16(d18s16, d31s16);
- q3s32 = vmull_s16(d19s16, d31s16);
- q4s32 = vmull_s16(d28s16, d31s16);
- q5s32 = vmull_s16(d29s16, d31s16);
-
- q2s32 = vmlal_s16(q2s32, d28s16, d30s16);
- q3s32 = vmlal_s16(q3s32, d29s16, d30s16);
- q4s32 = vmlsl_s16(q4s32, d18s16, d30s16);
- q5s32 = vmlsl_s16(q5s32, d19s16, d30s16);
-
- d12s16 = vqrshrn_n_s32(q2s32, 14);
- d13s16 = vqrshrn_n_s32(q3s32, 14);
- d2s16 = vqrshrn_n_s32(q4s32, 14);
- d3s16 = vqrshrn_n_s32(q5s32, 14);
- q1s16 = vcombine_s16(d2s16, d3s16);
- q6s16 = vcombine_s16(d12s16, d13s16);
-
- q3s16 = q11s16;
- q4s16 = q12s16;
-
- d30s16 = vdup_n_s16(-cospi_8_64);
- q11s32 = vmull_s16(d26s16, d30s16);
- q12s32 = vmull_s16(d27s16, d30s16);
- q8s32 = vmull_s16(d20s16, d30s16);
- q9s32 = vmull_s16(d21s16, d30s16);
-
- q11s32 = vmlsl_s16(q11s32, d20s16, d31s16);
- q12s32 = vmlsl_s16(q12s32, d21s16, d31s16);
- q8s32 = vmlal_s16(q8s32, d26s16, d31s16);
- q9s32 = vmlal_s16(q9s32, d27s16, d31s16);
-
- d4s16 = vqrshrn_n_s32(q11s32, 14);
- d5s16 = vqrshrn_n_s32(q12s32, 14);
- d10s16 = vqrshrn_n_s32(q8s32, 14);
- d11s16 = vqrshrn_n_s32(q9s32, 14);
- q2s16 = vcombine_s16(d4s16, d5s16);
- q5s16 = vcombine_s16(d10s16, d11s16);
+ idct_cospi_16_16_q(step1[1], step1[0], cospi_0_8_16_24, &step2[1], &step2[0]);
+ idct_cospi_8_24_q(step1[2], step1[3], cospi_0_8_16_24, &step2[2], &step2[3]);
+ step2[4] = vaddq_s16(step1[4], step1[5]);
+ step2[5] = vsubq_s16(step1[4], step1[5]);
+ step2[6] = vsubq_s16(step1[7], step1[6]);
+ step2[7] = vaddq_s16(step1[7], step1[6]);
+ step2[8] = step1[8];
+ idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
+ &step2[14]);
+ idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24, &step2[13],
+ &step2[10]);
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+ step2[15] = step1[15];
// stage 5
- q8s16 = vaddq_s16(q0s16, q3s16);
- q9s16 = vaddq_s16(q1s16, q2s16);
- q10s16 = vsubq_s16(q1s16, q2s16);
- q11s16 = vsubq_s16(q0s16, q3s16);
- q12s16 = vsubq_s16(q7s16, q4s16);
- q13s16 = vsubq_s16(q6s16, q5s16);
- q14s16 = vaddq_s16(q6s16, q5s16);
- q15s16 = vaddq_s16(q7s16, q4s16);
+ step1[0] = vaddq_s16(step2[0], step2[3]);
+ step1[1] = vaddq_s16(step2[1], step2[2]);
+ step1[2] = vsubq_s16(step2[1], step2[2]);
+ step1[3] = vsubq_s16(step2[0], step2[3]);
+ step1[4] = step2[4];
+ idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]);
+ step1[7] = step2[7];
+ step1[8] = vaddq_s16(step2[8], step2[11]);
+ step1[9] = vaddq_s16(step2[9], step2[10]);
+ step1[10] = vsubq_s16(step2[9], step2[10]);
+ step1[11] = vsubq_s16(step2[8], step2[11]);
+ step1[12] = vsubq_s16(step2[15], step2[12]);
+ step1[13] = vsubq_s16(step2[14], step2[13]);
+ step1[14] = vaddq_s16(step2[14], step2[13]);
+ step1[15] = vaddq_s16(step2[15], step2[12]);
// stage 6
- d20s16 = vget_low_s16(q10s16);
- d21s16 = vget_high_s16(q10s16);
- d22s16 = vget_low_s16(q11s16);
- d23s16 = vget_high_s16(q11s16);
- d24s16 = vget_low_s16(q12s16);
- d25s16 = vget_high_s16(q12s16);
- d26s16 = vget_low_s16(q13s16);
- d27s16 = vget_high_s16(q13s16);
-
- d14s16 = vdup_n_s16((int16_t)cospi_16_64);
-
- q3s32 = vmull_s16(d26s16, d14s16);
- q4s32 = vmull_s16(d27s16, d14s16);
- q0s32 = vmull_s16(d20s16, d14s16);
- q1s32 = vmull_s16(d21s16, d14s16);
-
- q5s32 = vsubq_s32(q3s32, q0s32);
- q6s32 = vsubq_s32(q4s32, q1s32);
- q10s32 = vaddq_s32(q3s32, q0s32);
- q4s32 = vaddq_s32(q4s32, q1s32);
-
- d4s16 = vqrshrn_n_s32(q5s32, 14);
- d5s16 = vqrshrn_n_s32(q6s32, 14);
- d10s16 = vqrshrn_n_s32(q10s32, 14);
- d11s16 = vqrshrn_n_s32(q4s32, 14);
- q2s16 = vcombine_s16(d4s16, d5s16);
- q5s16 = vcombine_s16(d10s16, d11s16);
-
- q0s32 = vmull_s16(d22s16, d14s16);
- q1s32 = vmull_s16(d23s16, d14s16);
- q13s32 = vmull_s16(d24s16, d14s16);
- q6s32 = vmull_s16(d25s16, d14s16);
-
- q10s32 = vsubq_s32(q13s32, q0s32);
- q4s32 = vsubq_s32(q6s32, q1s32);
- q13s32 = vaddq_s32(q13s32, q0s32);
- q6s32 = vaddq_s32(q6s32, q1s32);
-
- d6s16 = vqrshrn_n_s32(q10s32, 14);
- d7s16 = vqrshrn_n_s32(q4s32, 14);
- d8s16 = vqrshrn_n_s32(q13s32, 14);
- d9s16 = vqrshrn_n_s32(q6s32, 14);
- q3s16 = vcombine_s16(d6s16, d7s16);
- q4s16 = vcombine_s16(d8s16, d9s16);
+ step2[0] = vaddq_s16(step1[0], step1[7]);
+ step2[1] = vaddq_s16(step1[1], step1[6]);
+ step2[2] = vaddq_s16(step1[2], step1[5]);
+ step2[3] = vaddq_s16(step1[3], step1[4]);
+ step2[4] = vsubq_s16(step1[3], step1[4]);
+ step2[5] = vsubq_s16(step1[2], step1[5]);
+ step2[6] = vsubq_s16(step1[1], step1[6]);
+ step2[7] = vsubq_s16(step1[0], step1[7]);
+ idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
+ &step2[13]);
+ idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
+ &step2[12]);
+ step2[8] = step1[8];
+ step2[9] = step1[9];
+ step2[14] = step1[14];
+ step2[15] = step1[15];
// stage 7
- if (skip_adding != 0) {
- d = dest;
- // load the data in pass1
- q0s16 = vld1q_s16(pass1Output);
- pass1Output += 8;
- q1s16 = vld1q_s16(pass1Output);
- pass1Output += 8;
- d12s64 = vld1_s64((int64_t *)dest);
- dest += dest_stride;
- d13s64 = vld1_s64((int64_t *)dest);
- dest += dest_stride;
-
- q12s16 = vaddq_s16(q0s16, q15s16);
- q13s16 = vaddq_s16(q1s16, q14s16);
- q12s16 = vrshrq_n_s16(q12s16, 6);
- q13s16 = vrshrq_n_s16(q13s16, 6);
- q12u16 =
- vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64));
- q13u16 =
- vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64));
- d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
- d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
- vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
- d += dest_stride;
- vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
- d += dest_stride;
- q14s16 = vsubq_s16(q1s16, q14s16);
- q15s16 = vsubq_s16(q0s16, q15s16);
-
- q10s16 = vld1q_s16(pass1Output);
- pass1Output += 8;
- q11s16 = vld1q_s16(pass1Output);
- pass1Output += 8;
- d12s64 = vld1_s64((int64_t *)dest);
- dest += dest_stride;
- d13s64 = vld1_s64((int64_t *)dest);
- dest += dest_stride;
- q12s16 = vaddq_s16(q10s16, q5s16);
- q13s16 = vaddq_s16(q11s16, q4s16);
- q12s16 = vrshrq_n_s16(q12s16, 6);
- q13s16 = vrshrq_n_s16(q13s16, 6);
- q12u16 =
- vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64));
- q13u16 =
- vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64));
- d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
- d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
- vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
- d += dest_stride;
- vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
- d += dest_stride;
- q4s16 = vsubq_s16(q11s16, q4s16);
- q5s16 = vsubq_s16(q10s16, q5s16);
-
- q0s16 = vld1q_s16(pass1Output);
- pass1Output += 8;
- q1s16 = vld1q_s16(pass1Output);
- pass1Output += 8;
- d12s64 = vld1_s64((int64_t *)dest);
- dest += dest_stride;
- d13s64 = vld1_s64((int64_t *)dest);
- dest += dest_stride;
- q12s16 = vaddq_s16(q0s16, q3s16);
- q13s16 = vaddq_s16(q1s16, q2s16);
- q12s16 = vrshrq_n_s16(q12s16, 6);
- q13s16 = vrshrq_n_s16(q13s16, 6);
- q12u16 =
- vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64));
- q13u16 =
- vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64));
- d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
- d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
- vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
- d += dest_stride;
- vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
- d += dest_stride;
- q2s16 = vsubq_s16(q1s16, q2s16);
- q3s16 = vsubq_s16(q0s16, q3s16);
-
- q10s16 = vld1q_s16(pass1Output);
- pass1Output += 8;
- q11s16 = vld1q_s16(pass1Output);
- d12s64 = vld1_s64((int64_t *)dest);
- dest += dest_stride;
- d13s64 = vld1_s64((int64_t *)dest);
- dest += dest_stride;
- q12s16 = vaddq_s16(q10s16, q9s16);
- q13s16 = vaddq_s16(q11s16, q8s16);
- q12s16 = vrshrq_n_s16(q12s16, 6);
- q13s16 = vrshrq_n_s16(q13s16, 6);
- q12u16 =
- vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64));
- q13u16 =
- vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64));
- d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
- d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
- vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
- d += dest_stride;
- vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
- d += dest_stride;
- q8s16 = vsubq_s16(q11s16, q8s16);
- q9s16 = vsubq_s16(q10s16, q9s16);
-
- // store the data out 8,9,10,11,12,13,14,15
- d12s64 = vld1_s64((int64_t *)dest);
- dest += dest_stride;
- q8s16 = vrshrq_n_s16(q8s16, 6);
- q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_s64(d12s64));
- d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
- vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
- d += dest_stride;
-
- d12s64 = vld1_s64((int64_t *)dest);
- dest += dest_stride;
- q9s16 = vrshrq_n_s16(q9s16, 6);
- q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_s64(d12s64));
- d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
- vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
- d += dest_stride;
-
- d12s64 = vld1_s64((int64_t *)dest);
- dest += dest_stride;
- q2s16 = vrshrq_n_s16(q2s16, 6);
- q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2s16), vreinterpret_u8_s64(d12s64));
- d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16));
- vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
- d += dest_stride;
-
- d12s64 = vld1_s64((int64_t *)dest);
- dest += dest_stride;
- q3s16 = vrshrq_n_s16(q3s16, 6);
- q3u16 = vaddw_u8(vreinterpretq_u16_s16(q3s16), vreinterpret_u8_s64(d12s64));
- d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q3u16));
- vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
- d += dest_stride;
-
- d12s64 = vld1_s64((int64_t *)dest);
- dest += dest_stride;
- q4s16 = vrshrq_n_s16(q4s16, 6);
- q4u16 = vaddw_u8(vreinterpretq_u16_s16(q4s16), vreinterpret_u8_s64(d12s64));
- d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q4u16));
- vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
- d += dest_stride;
-
- d12s64 = vld1_s64((int64_t *)dest);
- dest += dest_stride;
- q5s16 = vrshrq_n_s16(q5s16, 6);
- q5u16 = vaddw_u8(vreinterpretq_u16_s16(q5s16), vreinterpret_u8_s64(d12s64));
- d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q5u16));
- vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
- d += dest_stride;
-
- d12s64 = vld1_s64((int64_t *)dest);
- dest += dest_stride;
- q14s16 = vrshrq_n_s16(q14s16, 6);
- q14u16 =
- vaddw_u8(vreinterpretq_u16_s16(q14s16), vreinterpret_u8_s64(d12s64));
- d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q14u16));
- vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
- d += dest_stride;
-
- d12s64 = vld1_s64((int64_t *)dest);
- q15s16 = vrshrq_n_s16(q15s16, 6);
- q15u16 =
- vaddw_u8(vreinterpretq_u16_s16(q15s16), vreinterpret_u8_s64(d12s64));
- d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q15u16));
- vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
- } else { // skip_adding_dest
- q0s16 = vld1q_s16(pass1Output);
- pass1Output += 8;
- q1s16 = vld1q_s16(pass1Output);
- pass1Output += 8;
- q12s16 = vaddq_s16(q0s16, q15s16);
- q13s16 = vaddq_s16(q1s16, q14s16);
- d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
- d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
- d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
- d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
- vst1_u64((uint64_t *)out, d24u64);
- out += 4;
- vst1_u64((uint64_t *)out, d25u64);
- out += 12;
- vst1_u64((uint64_t *)out, d26u64);
- out += 4;
- vst1_u64((uint64_t *)out, d27u64);
- out += 12;
- q14s16 = vsubq_s16(q1s16, q14s16);
- q15s16 = vsubq_s16(q0s16, q15s16);
-
- q10s16 = vld1q_s16(pass1Output);
- pass1Output += 8;
- q11s16 = vld1q_s16(pass1Output);
- pass1Output += 8;
- q12s16 = vaddq_s16(q10s16, q5s16);
- q13s16 = vaddq_s16(q11s16, q4s16);
- d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
- d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
- d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
- d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
- vst1_u64((uint64_t *)out, d24u64);
- out += 4;
- vst1_u64((uint64_t *)out, d25u64);
- out += 12;
- vst1_u64((uint64_t *)out, d26u64);
- out += 4;
- vst1_u64((uint64_t *)out, d27u64);
- out += 12;
- q4s16 = vsubq_s16(q11s16, q4s16);
- q5s16 = vsubq_s16(q10s16, q5s16);
-
- q0s16 = vld1q_s16(pass1Output);
- pass1Output += 8;
- q1s16 = vld1q_s16(pass1Output);
- pass1Output += 8;
- q12s16 = vaddq_s16(q0s16, q3s16);
- q13s16 = vaddq_s16(q1s16, q2s16);
- d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
- d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
- d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
- d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
- vst1_u64((uint64_t *)out, d24u64);
- out += 4;
- vst1_u64((uint64_t *)out, d25u64);
- out += 12;
- vst1_u64((uint64_t *)out, d26u64);
- out += 4;
- vst1_u64((uint64_t *)out, d27u64);
- out += 12;
- q2s16 = vsubq_s16(q1s16, q2s16);
- q3s16 = vsubq_s16(q0s16, q3s16);
-
- q10s16 = vld1q_s16(pass1Output);
- pass1Output += 8;
- q11s16 = vld1q_s16(pass1Output);
- pass1Output += 8;
- q12s16 = vaddq_s16(q10s16, q9s16);
- q13s16 = vaddq_s16(q11s16, q8s16);
- d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
- d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
- d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
- d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
- vst1_u64((uint64_t *)out, d24u64);
- out += 4;
- vst1_u64((uint64_t *)out, d25u64);
- out += 12;
- vst1_u64((uint64_t *)out, d26u64);
- out += 4;
- vst1_u64((uint64_t *)out, d27u64);
- out += 12;
- q8s16 = vsubq_s16(q11s16, q8s16);
- q9s16 = vsubq_s16(q10s16, q9s16);
-
- vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q8s16)));
- out += 4;
- vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q8s16)));
- out += 12;
- vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q9s16)));
- out += 4;
- vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q9s16)));
- out += 12;
- vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q2s16)));
- out += 4;
- vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q2s16)));
- out += 12;
- vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q3s16)));
- out += 4;
- vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q3s16)));
- out += 12;
- vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q4s16)));
- out += 4;
- vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q4s16)));
- out += 12;
- vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q5s16)));
- out += 4;
- vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q5s16)));
- out += 12;
- vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q14s16)));
- out += 4;
- vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q14s16)));
- out += 12;
- vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q15s16)));
- out += 4;
- vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q15s16)));
+ out[0] = vaddq_s16(step2[0], step2[15]);
+ out[1] = vaddq_s16(step2[1], step2[14]);
+ out[2] = vaddq_s16(step2[2], step2[13]);
+ out[3] = vaddq_s16(step2[3], step2[12]);
+ out[4] = vaddq_s16(step2[4], step2[11]);
+ out[5] = vaddq_s16(step2[5], step2[10]);
+ out[6] = vaddq_s16(step2[6], step2[9]);
+ out[7] = vaddq_s16(step2[7], step2[8]);
+ out[8] = vsubq_s16(step2[7], step2[8]);
+ out[9] = vsubq_s16(step2[6], step2[9]);
+ out[10] = vsubq_s16(step2[5], step2[10]);
+ out[11] = vsubq_s16(step2[4], step2[11]);
+ out[12] = vsubq_s16(step2[3], step2[12]);
+ out[13] = vsubq_s16(step2[2], step2[13]);
+ out[14] = vsubq_s16(step2[1], step2[14]);
+ out[15] = vsubq_s16(step2[0], step2[15]);
+
+ if (output) {
+ // pass 1: save the result into output
+ vst1q_s16(output, out[0]);
+ output += 16;
+ vst1q_s16(output, out[1]);
+ output += 16;
+ vst1q_s16(output, out[2]);
+ output += 16;
+ vst1q_s16(output, out[3]);
+ output += 16;
+ vst1q_s16(output, out[4]);
+ output += 16;
+ vst1q_s16(output, out[5]);
+ output += 16;
+ vst1q_s16(output, out[6]);
+ output += 16;
+ vst1q_s16(output, out[7]);
+ output += 16;
+ vst1q_s16(output, out[8]);
+ output += 16;
+ vst1q_s16(output, out[9]);
+ output += 16;
+ vst1q_s16(output, out[10]);
+ output += 16;
+ vst1q_s16(output, out[11]);
+ output += 16;
+ vst1q_s16(output, out[12]);
+ output += 16;
+ vst1q_s16(output, out[13]);
+ output += 16;
+ vst1q_s16(output, out[14]);
+ output += 16;
+ vst1q_s16(output, out[15]);
+ } else {
+ // pass 2: add the result to dest.
+ idct16x16_add8x1(out[0], &dest, stride);
+ idct16x16_add8x1(out[1], &dest, stride);
+ idct16x16_add8x1(out[2], &dest, stride);
+ idct16x16_add8x1(out[3], &dest, stride);
+ idct16x16_add8x1(out[4], &dest, stride);
+ idct16x16_add8x1(out[5], &dest, stride);
+ idct16x16_add8x1(out[6], &dest, stride);
+ idct16x16_add8x1(out[7], &dest, stride);
+ idct16x16_add8x1(out[8], &dest, stride);
+ idct16x16_add8x1(out[9], &dest, stride);
+ idct16x16_add8x1(out[10], &dest, stride);
+ idct16x16_add8x1(out[11], &dest, stride);
+ idct16x16_add8x1(out[12], &dest, stride);
+ idct16x16_add8x1(out[13], &dest, stride);
+ idct16x16_add8x1(out[14], &dest, stride);
+ idct16x16_add8x1(out[15], &dest, stride);
}
}
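
The helper above runs either pass depending on whether output is non-NULL: pass 1 saves the 16x8 result into a temporary buffer, pass 2 rounds and adds it into dest. A hypothetical driver sketch (the actual vpx_idct16x16_256_add_neon() wrapper is outside this hunk, so the pointer arithmetic here is illustrative only):

/* Sketch only: two row passes writing into a 16x16 temp buffer, then two
 * column passes that round and add into dest. */
static void idct16x16_256_add_sketch(const int16_t *input, uint8_t *dest,
                                     int stride) {
  int16_t row_buf[16 * 16];
  /* pass 1: upper and lower eight rows, saved to row_buf */
  idct16x16_256_add_half1d(input, row_buf, dest, stride);
  idct16x16_256_add_half1d(input + 8 * 16, row_buf + 8, dest, stride);
  /* pass 2: left and right eight columns, added into dest */
  idct16x16_256_add_half1d(row_buf, NULL, dest, stride);
  idct16x16_256_add_half1d(row_buf + 8 * 16, NULL, dest + 8, stride);
}
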
-void vpx_idct16x16_10_add_neon_pass1(int16_t *in, int16_t *out,
- int output_stride) {
- int16x4_t d4s16;
- int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
- uint64x1_t d4u64, d5u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64;
- uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
- int16x8_t q0s16, q1s16, q2s16, q4s16, q5s16, q6s16, q7s16;
- int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
- int32x4_t q6s32, q9s32;
- int32x4_t q10s32, q11s32, q12s32, q15s32;
- int16x8x2_t q0x2s16;
-
- q0x2s16 = vld2q_s16(in);
- q8s16 = q0x2s16.val[0];
- in += 16;
- q0x2s16 = vld2q_s16(in);
- q9s16 = q0x2s16.val[0];
- in += 16;
- q0x2s16 = vld2q_s16(in);
- q10s16 = q0x2s16.val[0];
- in += 16;
- q0x2s16 = vld2q_s16(in);
- q11s16 = q0x2s16.val[0];
- in += 16;
- q0x2s16 = vld2q_s16(in);
- q12s16 = q0x2s16.val[0];
- in += 16;
- q0x2s16 = vld2q_s16(in);
- q13s16 = q0x2s16.val[0];
- in += 16;
- q0x2s16 = vld2q_s16(in);
- q14s16 = q0x2s16.val[0];
- in += 16;
- q0x2s16 = vld2q_s16(in);
- q15s16 = q0x2s16.val[0];
-
- transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
- &q15s16);
+static void idct16x16_10_add_half1d_pass1(const tran_low_t *input,
+ int16_t *output) {
+ const int16x8_t cospis0 = vld1q_s16(kCospi);
+ const int16x8_t cospis1 = vld1q_s16(kCospi + 8);
+ const int16x8_t cospisd0 = vaddq_s16(cospis0, cospis0);
+ const int16x8_t cospisd1 = vaddq_s16(cospis1, cospis1);
+ const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0);
+ const int16x4_t cospid_0_8_16_24 = vget_low_s16(cospisd0);
+ const int16x4_t cospid_4_12_20N_28 = vget_high_s16(cospisd0);
+ const int16x4_t cospid_2_30_10_22 = vget_low_s16(cospisd1);
+ const int16x4_t cospid_6_26_14_18N = vget_high_s16(cospisd1);
+ int16x4_t in[4], step1[16], step2[16], out[16];
+
+// Load input (4x4)
+#if CONFIG_VP9_HIGHBITDEPTH
+ in[0] = load_tran_low_to_s16d(input);
+ input += 16;
+ in[1] = load_tran_low_to_s16d(input);
+ input += 16;
+ in[2] = load_tran_low_to_s16d(input);
+ input += 16;
+ in[3] = load_tran_low_to_s16d(input);
+#else
+ in[0] = vld1_s16(input);
+ input += 16;
+ in[1] = vld1_s16(input);
+ input += 16;
+ in[2] = vld1_s16(input);
+ input += 16;
+ in[3] = vld1_s16(input);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ // Transpose
+ transpose_s16_4x4d(&in[0], &in[1], &in[2], &in[3]);
+
+ // stage 1
+ step1[0] = in[0 / 2];
+ step1[4] = in[4 / 2];
+ step1[8] = in[2 / 2];
+ step1[12] = in[6 / 2];
+
+ // stage 2
+ step2[0] = step1[0];
+ step2[4] = step1[4];
+ step2[8] = vqrdmulh_lane_s16(step1[8], cospid_2_30_10_22, 1);
+ step2[11] = vqrdmulh_lane_s16(step1[12], cospid_6_26_14_18N, 1);
+ step2[12] = vqrdmulh_lane_s16(step1[12], cospid_6_26_14_18N, 0);
+ step2[15] = vqrdmulh_lane_s16(step1[8], cospid_2_30_10_22, 0);
// stage 3
- q0s16 = vdupq_n_s16((int16_t)cospi_28_64 * 2);
- q1s16 = vdupq_n_s16((int16_t)cospi_4_64 * 2);
-
- q4s16 = vqrdmulhq_s16(q9s16, q0s16);
- q7s16 = vqrdmulhq_s16(q9s16, q1s16);
+ step1[0] = step2[0];
+ step1[4] = vqrdmulh_lane_s16(step2[4], cospid_4_12_20N_28, 3);
+ step1[7] = vqrdmulh_lane_s16(step2[4], cospid_4_12_20N_28, 0);
+ step1[8] = step2[8];
+ step1[9] = step2[8];
+ step1[10] = step2[11];
+ step1[11] = step2[11];
+ step1[12] = step2[12];
+ step1[13] = step2[12];
+ step1[14] = step2[15];
+ step1[15] = step2[15];
// stage 4
- q1s16 = vdupq_n_s16((int16_t)cospi_16_64 * 2);
- d4s16 = vdup_n_s16((int16_t)cospi_16_64);
-
- q8s16 = vqrdmulhq_s16(q8s16, q1s16);
-
- d8s16 = vget_low_s16(q4s16);
- d9s16 = vget_high_s16(q4s16);
- d14s16 = vget_low_s16(q7s16);
- d15s16 = vget_high_s16(q7s16);
- q9s32 = vmull_s16(d14s16, d4s16);
- q10s32 = vmull_s16(d15s16, d4s16);
- q12s32 = vmull_s16(d9s16, d4s16);
- q11s32 = vmull_s16(d8s16, d4s16);
-
- q15s32 = vsubq_s32(q10s32, q12s32);
- q6s32 = vsubq_s32(q9s32, q11s32);
- q9s32 = vaddq_s32(q9s32, q11s32);
- q10s32 = vaddq_s32(q10s32, q12s32);
-
- d11s16 = vqrshrn_n_s32(q15s32, 14);
- d10s16 = vqrshrn_n_s32(q6s32, 14);
- d12s16 = vqrshrn_n_s32(q9s32, 14);
- d13s16 = vqrshrn_n_s32(q10s32, 14);
- q5s16 = vcombine_s16(d10s16, d11s16);
- q6s16 = vcombine_s16(d12s16, d13s16);
+ step2[0] = step2[1] = vqrdmulh_lane_s16(step1[0], cospid_0_8_16_24, 2);
+ step2[4] = step1[4];
+ step2[5] = step1[4];
+ step2[6] = step1[7];
+ step2[7] = step1[7];
+ step2[8] = step1[8];
+ idct_cospi_8_24_d(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
+ &step2[14]);
+ idct_cospi_8_24_neg_d(step1[13], step1[10], cospi_0_8_16_24, &step2[13],
+ &step2[10]);
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+ step2[15] = step1[15];
+
+ // stage 5
+ step1[0] = step2[0];
+ step1[1] = step2[1];
+ step1[2] = step2[1];
+ step1[3] = step2[0];
+ step1[4] = step2[4];
+ idct_cospi_16_16_d(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]);
+ step1[7] = step2[7];
+ step1[8] = vadd_s16(step2[8], step2[11]);
+ step1[9] = vadd_s16(step2[9], step2[10]);
+ step1[10] = vsub_s16(step2[9], step2[10]);
+ step1[11] = vsub_s16(step2[8], step2[11]);
+ step1[12] = vsub_s16(step2[15], step2[12]);
+ step1[13] = vsub_s16(step2[14], step2[13]);
+ step1[14] = vadd_s16(step2[14], step2[13]);
+ step1[15] = vadd_s16(step2[15], step2[12]);
// stage 6
- q2s16 = vaddq_s16(q8s16, q7s16);
- q9s16 = vaddq_s16(q8s16, q6s16);
- q10s16 = vaddq_s16(q8s16, q5s16);
- q11s16 = vaddq_s16(q8s16, q4s16);
- q12s16 = vsubq_s16(q8s16, q4s16);
- q13s16 = vsubq_s16(q8s16, q5s16);
- q14s16 = vsubq_s16(q8s16, q6s16);
- q15s16 = vsubq_s16(q8s16, q7s16);
-
- d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16));
- d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16));
- d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
- d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
- d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16));
- d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16));
- d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16));
- d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16));
- d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
- d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
- d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
- d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
- d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
- d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
- d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
- d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));
-
- // store the data
- output_stride >>= 1; // output_stride / 2, out is int16_t
- vst1_u64((uint64_t *)out, d4u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d5u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d18u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d19u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d20u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d21u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d22u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d23u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d24u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d25u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d26u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d27u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d28u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d29u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d30u64);
- out += output_stride;
- vst1_u64((uint64_t *)out, d31u64);
+ step2[0] = vadd_s16(step1[0], step1[7]);
+ step2[1] = vadd_s16(step1[1], step1[6]);
+ step2[2] = vadd_s16(step1[2], step1[5]);
+ step2[3] = vadd_s16(step1[3], step1[4]);
+ step2[4] = vsub_s16(step1[3], step1[4]);
+ step2[5] = vsub_s16(step1[2], step1[5]);
+ step2[6] = vsub_s16(step1[1], step1[6]);
+ step2[7] = vsub_s16(step1[0], step1[7]);
+ idct_cospi_16_16_d(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
+ &step2[13]);
+ idct_cospi_16_16_d(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
+ &step2[12]);
+ step2[8] = step1[8];
+ step2[9] = step1[9];
+ step2[14] = step1[14];
+ step2[15] = step1[15];
+
+ // stage 7
+ out[0] = vadd_s16(step2[0], step2[15]);
+ out[1] = vadd_s16(step2[1], step2[14]);
+ out[2] = vadd_s16(step2[2], step2[13]);
+ out[3] = vadd_s16(step2[3], step2[12]);
+ out[4] = vadd_s16(step2[4], step2[11]);
+ out[5] = vadd_s16(step2[5], step2[10]);
+ out[6] = vadd_s16(step2[6], step2[9]);
+ out[7] = vadd_s16(step2[7], step2[8]);
+ out[8] = vsub_s16(step2[7], step2[8]);
+ out[9] = vsub_s16(step2[6], step2[9]);
+ out[10] = vsub_s16(step2[5], step2[10]);
+ out[11] = vsub_s16(step2[4], step2[11]);
+ out[12] = vsub_s16(step2[3], step2[12]);
+ out[13] = vsub_s16(step2[2], step2[13]);
+ out[14] = vsub_s16(step2[1], step2[14]);
+ out[15] = vsub_s16(step2[0], step2[15]);
+
+ // pass 1: save the result into output
+ vst1_s16(output, out[0]);
+ output += 4;
+ vst1_s16(output, out[1]);
+ output += 4;
+ vst1_s16(output, out[2]);
+ output += 4;
+ vst1_s16(output, out[3]);
+ output += 4;
+ vst1_s16(output, out[4]);
+ output += 4;
+ vst1_s16(output, out[5]);
+ output += 4;
+ vst1_s16(output, out[6]);
+ output += 4;
+ vst1_s16(output, out[7]);
+ output += 4;
+ vst1_s16(output, out[8]);
+ output += 4;
+ vst1_s16(output, out[9]);
+ output += 4;
+ vst1_s16(output, out[10]);
+ output += 4;
+ vst1_s16(output, out[11]);
+ output += 4;
+ vst1_s16(output, out[12]);
+ output += 4;
+ vst1_s16(output, out[13]);
+ output += 4;
+ vst1_s16(output, out[14]);
+ output += 4;
+ vst1_s16(output, out[15]);
}
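
The cospisd values above are the cosine constants doubled before use with vqrdmulh. A hedged sketch of why that reproduces the same rounding as dct_const_round_shift() (the helper name is illustrative, not part of the patch):

#include <arm_neon.h>

/* vqrdmulh(a, b) computes (2*a*b + (1 << 15)) >> 16.  With b = 2*cospi this
 * becomes (a*cospi + (1 << 13)) >> 14, i.e. the same rounding as
 * dct_const_round_shift(a * cospi), without a widening multiply. */
static inline int16x8_t mul_dct_const_round_shift_sketch(const int16x8_t a,
                                                         const int16_t cospi) {
  return vqrdmulhq_n_s16(a, (int16_t)(2 * cospi));
}
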
-void vpx_idct16x16_10_add_neon_pass2(int16_t *src, int16_t *out,
- int16_t *pass1Output, int16_t skip_adding,
- uint8_t *dest, int dest_stride) {
- int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
- int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
- int16x4_t d20s16, d21s16, d22s16, d23s16;
- int16x4_t d24s16, d25s16, d26s16, d27s16, d30s16, d31s16;
- uint64x1_t d4u64, d5u64, d6u64, d7u64, d8u64, d9u64, d10u64, d11u64;
- uint64x1_t d16u64, d17u64, d18u64, d19u64;
- uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
- int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
- int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
- int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32;
- int32x4_t q10s32, q11s32, q12s32, q13s32;
- int16x8x2_t q0x2s16;
- (void)skip_adding;
- (void)dest;
- (void)dest_stride;
-
- q0x2s16 = vld2q_s16(src);
- q8s16 = q0x2s16.val[0];
- src += 16;
- q0x2s16 = vld2q_s16(src);
- q9s16 = q0x2s16.val[0];
- src += 16;
- q0x2s16 = vld2q_s16(src);
- q10s16 = q0x2s16.val[0];
- src += 16;
- q0x2s16 = vld2q_s16(src);
- q11s16 = q0x2s16.val[0];
- src += 16;
- q0x2s16 = vld2q_s16(src);
- q12s16 = q0x2s16.val[0];
- src += 16;
- q0x2s16 = vld2q_s16(src);
- q13s16 = q0x2s16.val[0];
- src += 16;
- q0x2s16 = vld2q_s16(src);
- q14s16 = q0x2s16.val[0];
- src += 16;
- q0x2s16 = vld2q_s16(src);
- q15s16 = q0x2s16.val[0];
-
- transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
- &q15s16);
+static void idct16x16_10_add_half1d_pass2(const int16_t *input, int16_t *output,
+ uint8_t *dest, int stride) {
+ const int16x8_t cospis0 = vld1q_s16(kCospi);
+ const int16x8_t cospis1 = vld1q_s16(kCospi + 8);
+ const int16x8_t cospisd0 = vaddq_s16(cospis0, cospis0);
+ const int16x8_t cospisd1 = vaddq_s16(cospis1, cospis1);
+ const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0);
+ const int16x4_t cospid_0_8_16_24 = vget_low_s16(cospisd0);
+ const int16x4_t cospid_4_12_20N_28 = vget_high_s16(cospisd0);
+ const int16x4_t cospid_2_30_10_22 = vget_low_s16(cospisd1);
+ const int16x4_t cospid_6_26_14_18N = vget_high_s16(cospisd1);
+ int16x4_t ind[8];
+ int16x8_t in[4], step1[16], step2[16], out[16];
+
+ // Load input (4x8)
+ ind[0] = vld1_s16(input);
+ input += 4;
+ ind[1] = vld1_s16(input);
+ input += 4;
+ ind[2] = vld1_s16(input);
+ input += 4;
+ ind[3] = vld1_s16(input);
+ input += 4;
+ ind[4] = vld1_s16(input);
+ input += 4;
+ ind[5] = vld1_s16(input);
+ input += 4;
+ ind[6] = vld1_s16(input);
+ input += 4;
+ ind[7] = vld1_s16(input);
+
+ // Transpose
+ transpose_s16_4x8(ind[0], ind[1], ind[2], ind[3], ind[4], ind[5], ind[6],
+ ind[7], &in[0], &in[1], &in[2], &in[3]);
+
+ // stage 1
+ step1[0] = in[0 / 2];
+ step1[4] = in[4 / 2];
+ step1[8] = in[2 / 2];
+ step1[12] = in[6 / 2];
+
+ // stage 2
+ step2[0] = step1[0];
+ step2[4] = step1[4];
+ step2[8] = vqrdmulhq_lane_s16(step1[8], cospid_2_30_10_22, 1);
+ step2[11] = vqrdmulhq_lane_s16(step1[12], cospid_6_26_14_18N, 1);
+ step2[12] = vqrdmulhq_lane_s16(step1[12], cospid_6_26_14_18N, 0);
+ step2[15] = vqrdmulhq_lane_s16(step1[8], cospid_2_30_10_22, 0);
// stage 3
- q6s16 = vdupq_n_s16((int16_t)cospi_30_64 * 2);
- q0s16 = vqrdmulhq_s16(q8s16, q6s16);
- q6s16 = vdupq_n_s16((int16_t)cospi_2_64 * 2);
- q7s16 = vqrdmulhq_s16(q8s16, q6s16);
-
- q15s16 = vdupq_n_s16((int16_t)-cospi_26_64 * 2);
- q14s16 = vdupq_n_s16((int16_t)cospi_6_64 * 2);
- q3s16 = vqrdmulhq_s16(q9s16, q15s16);
- q4s16 = vqrdmulhq_s16(q9s16, q14s16);
+ step1[0] = step2[0];
+ step1[4] = vqrdmulhq_lane_s16(step2[4], cospid_4_12_20N_28, 3);
+ step1[7] = vqrdmulhq_lane_s16(step2[4], cospid_4_12_20N_28, 0);
+ step1[8] = step2[8];
+ step1[9] = step2[8];
+ step1[10] = step2[11];
+ step1[11] = step2[11];
+ step1[12] = step2[12];
+ step1[13] = step2[12];
+ step1[14] = step2[15];
+ step1[15] = step2[15];
// stage 4
- d0s16 = vget_low_s16(q0s16);
- d1s16 = vget_high_s16(q0s16);
- d6s16 = vget_low_s16(q3s16);
- d7s16 = vget_high_s16(q3s16);
- d8s16 = vget_low_s16(q4s16);
- d9s16 = vget_high_s16(q4s16);
- d14s16 = vget_low_s16(q7s16);
- d15s16 = vget_high_s16(q7s16);
-
- d30s16 = vdup_n_s16((int16_t)cospi_8_64);
- d31s16 = vdup_n_s16((int16_t)cospi_24_64);
-
- q12s32 = vmull_s16(d14s16, d31s16);
- q5s32 = vmull_s16(d15s16, d31s16);
- q2s32 = vmull_s16(d0s16, d31s16);
- q11s32 = vmull_s16(d1s16, d31s16);
-
- q12s32 = vmlsl_s16(q12s32, d0s16, d30s16);
- q5s32 = vmlsl_s16(q5s32, d1s16, d30s16);
- q2s32 = vmlal_s16(q2s32, d14s16, d30s16);
- q11s32 = vmlal_s16(q11s32, d15s16, d30s16);
-
- d2s16 = vqrshrn_n_s32(q12s32, 14);
- d3s16 = vqrshrn_n_s32(q5s32, 14);
- d12s16 = vqrshrn_n_s32(q2s32, 14);
- d13s16 = vqrshrn_n_s32(q11s32, 14);
- q1s16 = vcombine_s16(d2s16, d3s16);
- q6s16 = vcombine_s16(d12s16, d13s16);
-
- d30s16 = vdup_n_s16(-cospi_8_64);
- q10s32 = vmull_s16(d8s16, d30s16);
- q13s32 = vmull_s16(d9s16, d30s16);
- q8s32 = vmull_s16(d6s16, d30s16);
- q9s32 = vmull_s16(d7s16, d30s16);
-
- q10s32 = vmlsl_s16(q10s32, d6s16, d31s16);
- q13s32 = vmlsl_s16(q13s32, d7s16, d31s16);
- q8s32 = vmlal_s16(q8s32, d8s16, d31s16);
- q9s32 = vmlal_s16(q9s32, d9s16, d31s16);
-
- d4s16 = vqrshrn_n_s32(q10s32, 14);
- d5s16 = vqrshrn_n_s32(q13s32, 14);
- d10s16 = vqrshrn_n_s32(q8s32, 14);
- d11s16 = vqrshrn_n_s32(q9s32, 14);
- q2s16 = vcombine_s16(d4s16, d5s16);
- q5s16 = vcombine_s16(d10s16, d11s16);
+ step2[0] = step2[1] = vqrdmulhq_lane_s16(step1[0], cospid_0_8_16_24, 2);
+ step2[4] = step1[4];
+ step2[5] = step1[4];
+ step2[6] = step1[7];
+ step2[7] = step1[7];
+ step2[8] = step1[8];
+ idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
+ &step2[14]);
+ idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24, &step2[13],
+ &step2[10]);
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+ step2[15] = step1[15];
// stage 5
- q8s16 = vaddq_s16(q0s16, q3s16);
- q9s16 = vaddq_s16(q1s16, q2s16);
- q10s16 = vsubq_s16(q1s16, q2s16);
- q11s16 = vsubq_s16(q0s16, q3s16);
- q12s16 = vsubq_s16(q7s16, q4s16);
- q13s16 = vsubq_s16(q6s16, q5s16);
- q14s16 = vaddq_s16(q6s16, q5s16);
- q15s16 = vaddq_s16(q7s16, q4s16);
+ step1[0] = step2[0];
+ step1[1] = step2[1];
+ step1[2] = step2[1];
+ step1[3] = step2[0];
+ step1[4] = step2[4];
+ idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]);
+ step1[7] = step2[7];
+ step1[8] = vaddq_s16(step2[8], step2[11]);
+ step1[9] = vaddq_s16(step2[9], step2[10]);
+ step1[10] = vsubq_s16(step2[9], step2[10]);
+ step1[11] = vsubq_s16(step2[8], step2[11]);
+ step1[12] = vsubq_s16(step2[15], step2[12]);
+ step1[13] = vsubq_s16(step2[14], step2[13]);
+ step1[14] = vaddq_s16(step2[14], step2[13]);
+ step1[15] = vaddq_s16(step2[15], step2[12]);
// stage 6
- d20s16 = vget_low_s16(q10s16);
- d21s16 = vget_high_s16(q10s16);
- d22s16 = vget_low_s16(q11s16);
- d23s16 = vget_high_s16(q11s16);
- d24s16 = vget_low_s16(q12s16);
- d25s16 = vget_high_s16(q12s16);
- d26s16 = vget_low_s16(q13s16);
- d27s16 = vget_high_s16(q13s16);
-
- d14s16 = vdup_n_s16((int16_t)cospi_16_64);
- q3s32 = vmull_s16(d26s16, d14s16);
- q4s32 = vmull_s16(d27s16, d14s16);
- q0s32 = vmull_s16(d20s16, d14s16);
- q1s32 = vmull_s16(d21s16, d14s16);
-
- q5s32 = vsubq_s32(q3s32, q0s32);
- q6s32 = vsubq_s32(q4s32, q1s32);
- q0s32 = vaddq_s32(q3s32, q0s32);
- q4s32 = vaddq_s32(q4s32, q1s32);
-
- d4s16 = vqrshrn_n_s32(q5s32, 14);
- d5s16 = vqrshrn_n_s32(q6s32, 14);
- d10s16 = vqrshrn_n_s32(q0s32, 14);
- d11s16 = vqrshrn_n_s32(q4s32, 14);
- q2s16 = vcombine_s16(d4s16, d5s16);
- q5s16 = vcombine_s16(d10s16, d11s16);
-
- q0s32 = vmull_s16(d22s16, d14s16);
- q1s32 = vmull_s16(d23s16, d14s16);
- q13s32 = vmull_s16(d24s16, d14s16);
- q6s32 = vmull_s16(d25s16, d14s16);
-
- q10s32 = vsubq_s32(q13s32, q0s32);
- q4s32 = vsubq_s32(q6s32, q1s32);
- q13s32 = vaddq_s32(q13s32, q0s32);
- q6s32 = vaddq_s32(q6s32, q1s32);
-
- d6s16 = vqrshrn_n_s32(q10s32, 14);
- d7s16 = vqrshrn_n_s32(q4s32, 14);
- d8s16 = vqrshrn_n_s32(q13s32, 14);
- d9s16 = vqrshrn_n_s32(q6s32, 14);
- q3s16 = vcombine_s16(d6s16, d7s16);
- q4s16 = vcombine_s16(d8s16, d9s16);
+ step2[0] = vaddq_s16(step1[0], step1[7]);
+ step2[1] = vaddq_s16(step1[1], step1[6]);
+ step2[2] = vaddq_s16(step1[2], step1[5]);
+ step2[3] = vaddq_s16(step1[3], step1[4]);
+ step2[4] = vsubq_s16(step1[3], step1[4]);
+ step2[5] = vsubq_s16(step1[2], step1[5]);
+ step2[6] = vsubq_s16(step1[1], step1[6]);
+ step2[7] = vsubq_s16(step1[0], step1[7]);
+ idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
+ &step2[13]);
+ idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
+ &step2[12]);
+ step2[8] = step1[8];
+ step2[9] = step1[9];
+ step2[14] = step1[14];
+ step2[15] = step1[15];
// stage 7
- q0s16 = vld1q_s16(pass1Output);
- pass1Output += 8;
- q1s16 = vld1q_s16(pass1Output);
- pass1Output += 8;
- q12s16 = vaddq_s16(q0s16, q15s16);
- q13s16 = vaddq_s16(q1s16, q14s16);
- d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
- d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
- d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
- d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
- vst1_u64((uint64_t *)out, d24u64);
- out += 4;
- vst1_u64((uint64_t *)out, d25u64);
- out += 12;
- vst1_u64((uint64_t *)out, d26u64);
- out += 4;
- vst1_u64((uint64_t *)out, d27u64);
- out += 12;
- q14s16 = vsubq_s16(q1s16, q14s16);
- q15s16 = vsubq_s16(q0s16, q15s16);
-
- q10s16 = vld1q_s16(pass1Output);
- pass1Output += 8;
- q11s16 = vld1q_s16(pass1Output);
- pass1Output += 8;
- q12s16 = vaddq_s16(q10s16, q5s16);
- q13s16 = vaddq_s16(q11s16, q4s16);
- d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
- d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
- d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
- d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
- vst1_u64((uint64_t *)out, d24u64);
- out += 4;
- vst1_u64((uint64_t *)out, d25u64);
- out += 12;
- vst1_u64((uint64_t *)out, d26u64);
- out += 4;
- vst1_u64((uint64_t *)out, d27u64);
- out += 12;
- q4s16 = vsubq_s16(q11s16, q4s16);
- q5s16 = vsubq_s16(q10s16, q5s16);
-
- q0s16 = vld1q_s16(pass1Output);
- pass1Output += 8;
- q1s16 = vld1q_s16(pass1Output);
- pass1Output += 8;
- q12s16 = vaddq_s16(q0s16, q3s16);
- q13s16 = vaddq_s16(q1s16, q2s16);
- d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
- d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
- d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
- d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
- vst1_u64((uint64_t *)out, d24u64);
- out += 4;
- vst1_u64((uint64_t *)out, d25u64);
- out += 12;
- vst1_u64((uint64_t *)out, d26u64);
- out += 4;
- vst1_u64((uint64_t *)out, d27u64);
- out += 12;
- q2s16 = vsubq_s16(q1s16, q2s16);
- q3s16 = vsubq_s16(q0s16, q3s16);
-
- q10s16 = vld1q_s16(pass1Output);
- pass1Output += 8;
- q11s16 = vld1q_s16(pass1Output);
- q12s16 = vaddq_s16(q10s16, q9s16);
- q13s16 = vaddq_s16(q11s16, q8s16);
- d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
- d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
- d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
- d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
- vst1_u64((uint64_t *)out, d24u64);
- out += 4;
- vst1_u64((uint64_t *)out, d25u64);
- out += 12;
- vst1_u64((uint64_t *)out, d26u64);
- out += 4;
- vst1_u64((uint64_t *)out, d27u64);
- out += 12;
- q8s16 = vsubq_s16(q11s16, q8s16);
- q9s16 = vsubq_s16(q10s16, q9s16);
-
- d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16));
- d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16));
- d6u64 = vreinterpret_u64_s16(vget_low_s16(q3s16));
- d7u64 = vreinterpret_u64_s16(vget_high_s16(q3s16));
- d8u64 = vreinterpret_u64_s16(vget_low_s16(q4s16));
- d9u64 = vreinterpret_u64_s16(vget_high_s16(q4s16));
- d10u64 = vreinterpret_u64_s16(vget_low_s16(q5s16));
- d11u64 = vreinterpret_u64_s16(vget_high_s16(q5s16));
- d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16));
- d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16));
- d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
- d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
- d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
- d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
- d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
- d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));
-
- vst1_u64((uint64_t *)out, d16u64);
- out += 4;
- vst1_u64((uint64_t *)out, d17u64);
- out += 12;
- vst1_u64((uint64_t *)out, d18u64);
- out += 4;
- vst1_u64((uint64_t *)out, d19u64);
- out += 12;
- vst1_u64((uint64_t *)out, d4u64);
- out += 4;
- vst1_u64((uint64_t *)out, d5u64);
- out += 12;
- vst1_u64((uint64_t *)out, d6u64);
- out += 4;
- vst1_u64((uint64_t *)out, d7u64);
- out += 12;
- vst1_u64((uint64_t *)out, d8u64);
- out += 4;
- vst1_u64((uint64_t *)out, d9u64);
- out += 12;
- vst1_u64((uint64_t *)out, d10u64);
- out += 4;
- vst1_u64((uint64_t *)out, d11u64);
- out += 12;
- vst1_u64((uint64_t *)out, d28u64);
- out += 4;
- vst1_u64((uint64_t *)out, d29u64);
- out += 12;
- vst1_u64((uint64_t *)out, d30u64);
- out += 4;
- vst1_u64((uint64_t *)out, d31u64);
+ out[0] = vaddq_s16(step2[0], step2[15]);
+ out[1] = vaddq_s16(step2[1], step2[14]);
+ out[2] = vaddq_s16(step2[2], step2[13]);
+ out[3] = vaddq_s16(step2[3], step2[12]);
+ out[4] = vaddq_s16(step2[4], step2[11]);
+ out[5] = vaddq_s16(step2[5], step2[10]);
+ out[6] = vaddq_s16(step2[6], step2[9]);
+ out[7] = vaddq_s16(step2[7], step2[8]);
+ out[8] = vsubq_s16(step2[7], step2[8]);
+ out[9] = vsubq_s16(step2[6], step2[9]);
+ out[10] = vsubq_s16(step2[5], step2[10]);
+ out[11] = vsubq_s16(step2[4], step2[11]);
+ out[12] = vsubq_s16(step2[3], step2[12]);
+ out[13] = vsubq_s16(step2[2], step2[13]);
+ out[14] = vsubq_s16(step2[1], step2[14]);
+ out[15] = vsubq_s16(step2[0], step2[15]);
+
+ if (output) {
+ // pass 1: save the result into output
+ vst1q_s16(output, out[0]);
+ output += 16;
+ vst1q_s16(output, out[1]);
+ output += 16;
+ vst1q_s16(output, out[2]);
+ output += 16;
+ vst1q_s16(output, out[3]);
+ output += 16;
+ vst1q_s16(output, out[4]);
+ output += 16;
+ vst1q_s16(output, out[5]);
+ output += 16;
+ vst1q_s16(output, out[6]);
+ output += 16;
+ vst1q_s16(output, out[7]);
+ output += 16;
+ vst1q_s16(output, out[8]);
+ output += 16;
+ vst1q_s16(output, out[9]);
+ output += 16;
+ vst1q_s16(output, out[10]);
+ output += 16;
+ vst1q_s16(output, out[11]);
+ output += 16;
+ vst1q_s16(output, out[12]);
+ output += 16;
+ vst1q_s16(output, out[13]);
+ output += 16;
+ vst1q_s16(output, out[14]);
+ output += 16;
+ vst1q_s16(output, out[15]);
+ } else {
+ // pass 2: add the result to dest.
+ idct16x16_add8x1(out[0], &dest, stride);
+ idct16x16_add8x1(out[1], &dest, stride);
+ idct16x16_add8x1(out[2], &dest, stride);
+ idct16x16_add8x1(out[3], &dest, stride);
+ idct16x16_add8x1(out[4], &dest, stride);
+ idct16x16_add8x1(out[5], &dest, stride);
+ idct16x16_add8x1(out[6], &dest, stride);
+ idct16x16_add8x1(out[7], &dest, stride);
+ idct16x16_add8x1(out[8], &dest, stride);
+ idct16x16_add8x1(out[9], &dest, stride);
+ idct16x16_add8x1(out[10], &dest, stride);
+ idct16x16_add8x1(out[11], &dest, stride);
+ idct16x16_add8x1(out[12], &dest, stride);
+ idct16x16_add8x1(out[13], &dest, stride);
+ idct16x16_add8x1(out[14], &dest, stride);
+ idct16x16_add8x1(out[15], &dest, stride);
+ }
+}
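
The cospisd0/cospisd1 vectors above hold doubled cosine constants so that vqrdmulhq_lane_s16, which computes roughly (2 * a * b + 2^15) >> 16 with saturation, reproduces the usual Q14 rounding multiply (a * cospi + 2^13) >> 14 of the scalar transform. A small scalar sketch of that identity; the constant value is taken from vpx_dsp/txfm_common.h as an assumption, and the helper names are local to the sketch:

#include <stdint.h>
#include <stdio.h>

/* Scalar model of SQRDMULH (saturation omitted): (2*a*b + 2^15) >> 16. */
static int16_t sqrdmulh_model(int16_t a, int16_t b) {
  return (int16_t)((2 * (int32_t)a * (int32_t)b + (1 << 15)) >> 16);
}

int main(void) {
  const int16_t cospi_2_64 = 16305; /* assumed Q14 constant from txfm_common.h */
  const int16_t a = 1234;
  /* Passing the doubled constant reproduces the Q14 rounding multiply. */
  const int16_t via_neon_idiom = sqrdmulh_model(a, (int16_t)(2 * cospi_2_64));
  const int16_t via_reference =
      (int16_t)(((int32_t)a * cospi_2_64 + (1 << 13)) >> 14);
  printf("%d %d\n", via_neon_idiom, via_reference); /* prints the same value */
  return 0;
}
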
+
+void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ int16_t row_idct_output[16 * 16];
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ int16_t pass1_input[16 * 16];
+ idct16x16_256_add_load_tran_low(input, pass1_input);
+#else
+ const int16_t *pass1_input = input;
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ // pass 1
+ // Parallel idct on the upper 8 rows
+ idct16x16_256_add_half1d(pass1_input, row_idct_output, dest, stride);
+
+ // Parallel idct on the lower 8 rows
+ idct16x16_256_add_half1d(pass1_input + 8 * 16, row_idct_output + 8, dest,
+ stride);
+
+ // pass 2
+ // Parallel idct to get the left 8 columns
+ idct16x16_256_add_half1d(row_idct_output, NULL, dest, stride);
+
+ // Parallel idct to get the right 8 columns
+ idct16x16_256_add_half1d(row_idct_output + 16 * 8, NULL, dest + 8, stride);
+}
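
In pass 2 (output == NULL) each half1d call hands its result rows to idct16x16_add8x1(), which is assumed to round away the remaining 1/64 scaling and add the result to the destination pixels with unsigned saturation. A scalar sketch of that per-row step, with the helper name local to the sketch:

#include <stdint.h>

/* Rough scalar equivalent of one idct16x16_add8x1() call: round by 6 bits,
 * add to the 8 destination pixels, clamp to [0, 255], advance by stride. */
static void add8x1_model(const int16_t *res, uint8_t **dest, int stride) {
  int i;
  for (i = 0; i < 8; i++) {
    const int v = (*dest)[i] + ((res[i] + 32) >> 6);
    (*dest)[i] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
  }
  *dest += stride;
}
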
+
+void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ int16_t row_idct_output[4 * 16];
+
+ // pass 1
+  // Parallel idct on the upper 4 rows; only the top-left 4x4 block of
+  // coefficients is non-zero in the 10-coefficient case.
+ idct16x16_10_add_half1d_pass1(input, row_idct_output);
+
+ // pass 2
+ // Parallel idct to get the left 8 columns
+ idct16x16_10_add_half1d_pass2(row_idct_output, NULL, dest, stride);
+
+ // Parallel idct to get the right 8 columns
+ idct16x16_10_add_half1d_pass2(row_idct_output + 4 * 8, NULL, dest + 8,
+ stride);
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_neon.c
index bdbbf519332..47366bcb7d6 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_neon.c
@@ -11,16 +11,29 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/vpx_dsp_common.h"
-void vpx_idct16x16_256_add_neon_pass1(const int16_t *input, int16_t *output,
- int output_stride);
+void vpx_idct16x16_256_add_neon_pass1(const int16_t *input, int16_t *output);
void vpx_idct16x16_256_add_neon_pass2(const int16_t *src, int16_t *output,
- int16_t *pass1Output, int16_t skip_adding,
- uint8_t *dest, int dest_stride);
-void vpx_idct16x16_10_add_neon_pass1(const int16_t *input, int16_t *output,
- int output_stride);
-void vpx_idct16x16_10_add_neon_pass2(const int16_t *src, int16_t *output,
- int16_t *pass1Output, int16_t skip_adding,
- uint8_t *dest, int dest_stride);
+ int16_t *pass1_output,
+ int16_t skip_adding, uint8_t *dest,
+ int stride);
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_idct16x16_256_add_neon_pass1_tran_low(const tran_low_t *input,
+ int16_t *output);
+void vpx_idct16x16_256_add_neon_pass2_tran_low(const tran_low_t *src,
+ int16_t *output,
+ int16_t *pass1_output,
+ int16_t skip_adding,
+ uint8_t *dest, int stride);
+#else
+#define vpx_idct16x16_256_add_neon_pass1_tran_low \
+ vpx_idct16x16_256_add_neon_pass1
+#define vpx_idct16x16_256_add_neon_pass2_tran_low \
+ vpx_idct16x16_256_add_neon_pass2
+#endif
+
+void vpx_idct16x16_10_add_neon_pass1(const tran_low_t *input, int16_t *output);
+void vpx_idct16x16_10_add_neon_pass2(const tran_low_t *src, int16_t *output,
+ int16_t *pass1_output);
#if HAVE_NEON_ASM
/* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */
@@ -28,8 +41,8 @@ extern void vpx_push_neon(int64_t *store);
extern void vpx_pop_neon(int64_t *store);
#endif // HAVE_NEON_ASM
-void vpx_idct16x16_256_add_neon(const int16_t *input, uint8_t *dest,
- int dest_stride) {
+void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest,
+ int stride) {
#if HAVE_NEON_ASM
int64_t store_reg[8];
#endif
@@ -44,47 +57,47 @@ void vpx_idct16x16_256_add_neon(const int16_t *input, uint8_t *dest,
/* Parallel idct on the upper 8 rows */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the
// stage 6 result in pass1_output.
- vpx_idct16x16_256_add_neon_pass1(input, pass1_output, 8);
+ vpx_idct16x16_256_add_neon_pass1_tran_low(input, pass1_output);
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
// with result in pass1(pass1_output) to calculate final result in stage 7
// which will be saved into row_idct_output.
- vpx_idct16x16_256_add_neon_pass2(input + 1, row_idct_output, pass1_output, 0,
- dest, dest_stride);
+ vpx_idct16x16_256_add_neon_pass2_tran_low(input + 1, row_idct_output,
+ pass1_output, 0, dest, stride);
/* Parallel idct on the lower 8 rows */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the
// stage 6 result in pass1_output.
- vpx_idct16x16_256_add_neon_pass1(input + 8 * 16, pass1_output, 8);
+ vpx_idct16x16_256_add_neon_pass1_tran_low(input + 8 * 16, pass1_output);
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
// with result in pass1(pass1_output) to calculate final result in stage 7
// which will be saved into row_idct_output.
- vpx_idct16x16_256_add_neon_pass2(input + 8 * 16 + 1, row_idct_output + 8,
- pass1_output, 0, dest, dest_stride);
+ vpx_idct16x16_256_add_neon_pass2_tran_low(
+ input + 8 * 16 + 1, row_idct_output + 8, pass1_output, 0, dest, stride);
/* Parallel idct on the left 8 columns */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the
// stage 6 result in pass1_output.
- vpx_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8);
+ vpx_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output);
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
// with result in pass1(pass1_output) to calculate final result in stage 7.
// Then add the result to the destination data.
vpx_idct16x16_256_add_neon_pass2(row_idct_output + 1, row_idct_output,
- pass1_output, 1, dest, dest_stride);
+ pass1_output, 1, dest, stride);
/* Parallel idct on the right 8 columns */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the
// stage 6 result in pass1_output.
- vpx_idct16x16_256_add_neon_pass1(row_idct_output + 8 * 16, pass1_output, 8);
+ vpx_idct16x16_256_add_neon_pass1(row_idct_output + 8 * 16, pass1_output);
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
// with result in pass1(pass1_output) to calculate final result in stage 7.
// Then add the result to the destination data.
vpx_idct16x16_256_add_neon_pass2(row_idct_output + 8 * 16 + 1,
row_idct_output + 8, pass1_output, 1,
- dest + 8, dest_stride);
+ dest + 8, stride);
#if HAVE_NEON_ASM
// restore d8-d15 register values.
@@ -92,8 +105,8 @@ void vpx_idct16x16_256_add_neon(const int16_t *input, uint8_t *dest,
#endif
}
-void vpx_idct16x16_10_add_neon(const int16_t *input, uint8_t *dest,
- int dest_stride) {
+void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest,
+ int stride) {
#if HAVE_NEON_ASM
int64_t store_reg[8];
#endif
@@ -108,38 +121,37 @@ void vpx_idct16x16_10_add_neon(const int16_t *input, uint8_t *dest,
/* Parallel idct on the upper 8 rows */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the
// stage 6 result in pass1_output.
- vpx_idct16x16_10_add_neon_pass1(input, pass1_output, 8);
+ vpx_idct16x16_10_add_neon_pass1(input, pass1_output);
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
// with result in pass1(pass1_output) to calculate final result in stage 7
// which will be saved into row_idct_output.
- vpx_idct16x16_10_add_neon_pass2(input + 1, row_idct_output, pass1_output, 0,
- dest, dest_stride);
+ vpx_idct16x16_10_add_neon_pass2(input + 1, row_idct_output, pass1_output);
/* Skip Parallel idct on the lower 8 rows as they are all 0s */
/* Parallel idct on the left 8 columns */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the
// stage 6 result in pass1_output.
- vpx_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8);
+ vpx_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output);
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
// with result in pass1(pass1_output) to calculate final result in stage 7.
// Then add the result to the destination data.
vpx_idct16x16_256_add_neon_pass2(row_idct_output + 1, row_idct_output,
- pass1_output, 1, dest, dest_stride);
+ pass1_output, 1, dest, stride);
/* Parallel idct on the right 8 columns */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the
// stage 6 result in pass1_output.
- vpx_idct16x16_256_add_neon_pass1(row_idct_output + 8 * 16, pass1_output, 8);
+ vpx_idct16x16_256_add_neon_pass1(row_idct_output + 8 * 16, pass1_output);
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
// with result in pass1(pass1_output) to calculate final result in stage 7.
// Then add the result to the destination data.
vpx_idct16x16_256_add_neon_pass2(row_idct_output + 8 * 16 + 1,
row_idct_output + 8, pass1_output, 1,
- dest + 8, dest_stride);
+ dest + 8, stride);
#if HAVE_NEON_ASM
// restore d8-d15 register values.
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c
new file mode 100644
index 00000000000..28b94655848
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c
@@ -0,0 +1,714 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/txfm_common.h"
+
+static INLINE void load_8x8_s16(const tran_low_t *input, int16x8_t *const in0,
+ int16x8_t *const in1, int16x8_t *const in2,
+ int16x8_t *const in3, int16x8_t *const in4,
+ int16x8_t *const in5, int16x8_t *const in6,
+ int16x8_t *const in7) {
+ *in0 = load_tran_low_to_s16q(input);
+ input += 32;
+ *in1 = load_tran_low_to_s16q(input);
+ input += 32;
+ *in2 = load_tran_low_to_s16q(input);
+ input += 32;
+ *in3 = load_tran_low_to_s16q(input);
+ input += 32;
+ *in4 = load_tran_low_to_s16q(input);
+ input += 32;
+ *in5 = load_tran_low_to_s16q(input);
+ input += 32;
+ *in6 = load_tran_low_to_s16q(input);
+ input += 32;
+ *in7 = load_tran_low_to_s16q(input);
+}
+
+static INLINE void load_4x8_s16(const tran_low_t *input, int16x4_t *const in0,
+ int16x4_t *const in1, int16x4_t *const in2,
+ int16x4_t *const in3, int16x4_t *const in4,
+ int16x4_t *const in5, int16x4_t *const in6,
+ int16x4_t *const in7) {
+ *in0 = load_tran_low_to_s16d(input);
+ input += 32;
+ *in1 = load_tran_low_to_s16d(input);
+ input += 32;
+ *in2 = load_tran_low_to_s16d(input);
+ input += 32;
+ *in3 = load_tran_low_to_s16d(input);
+ input += 32;
+ *in4 = load_tran_low_to_s16d(input);
+ input += 32;
+ *in5 = load_tran_low_to_s16d(input);
+ input += 32;
+ *in6 = load_tran_low_to_s16d(input);
+ input += 32;
+ *in7 = load_tran_low_to_s16d(input);
+}
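
The load_tran_low_to_s16q()/load_tran_low_to_s16d() helpers are assumed to narrow coefficients on load: with CONFIG_VP9_HIGHBITDEPTH the tran_low_t storage type is 32 bits wide, while the NEON transform works on int16 lanes; in non-high-bit-depth builds they reduce to plain vld1q_s16()/vld1_s16(). A scalar sketch of that narrowing, with the typedef and helper local to the sketch:

#include <stdint.h>

/* Stand-in for tran_low_t under CONFIG_VP9_HIGHBITDEPTH (32-bit storage). */
typedef int32_t tran_low_example_t;

/* Narrow n coefficients from 32-bit storage to the 16-bit lanes the NEON
 * idct works on; the values are assumed to already fit in 16 bits. */
static void load_tran_low_model(const tran_low_example_t *in, int16_t *out,
                                int n) {
  int i;
  for (i = 0; i < n; i++) out[i] = (int16_t)in[i];
}
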
+
+// Only for the first pass of the _135_ variant. Since it only uses values from
+// the top left 16x16 it can safely assume all the remaining values are 0 and
+// skip an awful lot of calculations. In fact, only the first 12 columns make
+// the cut. None of the elements in the 13th, 14th, 15th or 16th columns are
+// used so it skips any calls to input[12|13|14|15] too.
+// In C this does a single row of 32 for each call. Here it transposes the top
+// left 12x8 to allow using SIMD.
+
+// vp9/common/vp9_scan.c:vp9_default_iscan_32x32 arranges the first 135 non-zero
+// coefficients as follows:
+// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+// 0 0 2 5 10 17 25 38 47 62 83 101 121
+// 1 1 4 8 15 22 30 45 58 74 92 112 133
+// 2 3 7 12 18 28 36 52 64 82 102 118
+// 3 6 11 16 23 31 43 60 73 90 109 126
+// 4 9 14 19 29 37 50 65 78 98 116 134
+// 5 13 20 26 35 44 54 72 85 105 123
+// 6 21 27 33 42 53 63 80 94 113 132
+// 7 24 32 39 48 57 71 88 104 120
+// 8 34 40 46 56 68 81 96 111 130
+// 9 41 49 55 67 77 91 107 124
+// 10 51 59 66 76 89 99 119 131
+// 11 61 69 75 87 100 114 129
+// 12 70 79 86 97 108 122
+// 13 84 93 103 110 125
+// 14 98 106 115 127
+// 15 117 128
+static void idct32_12_neon(const tran_low_t *input, int16_t *output) {
+ int16x8_t in0, in1, in2, in3, in4, in5, in6, in7;
+ int16x4_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ int16x8_t in8, in9, in10, in11;
+ int16x8_t s1_16, s1_18, s1_19, s1_20, s1_21, s1_23, s1_24, s1_26, s1_27,
+ s1_28, s1_29, s1_31;
+ int16x8_t s2_8, s2_10, s2_11, s2_12, s2_13, s2_15, s2_18, s2_19, s2_20, s2_21,
+ s2_26, s2_27, s2_28, s2_29;
+ int16x8_t s3_4, s3_7, s3_10, s3_11, s3_12, s3_13, s3_17, s3_18, s3_21, s3_22,
+ s3_25, s3_26, s3_29, s3_30;
+ int16x8_t s4_0, s4_2, s4_3, s4_9, s4_10, s4_13, s4_14, s4_16, s4_17, s4_18,
+ s4_19, s4_20, s4_21, s4_22, s4_23, s4_24, s4_25, s4_26, s4_27, s4_28,
+ s4_29, s4_30, s4_31;
+ int16x8_t s5_0, s5_1, s5_2, s5_3, s5_5, s5_6, s5_8, s5_9, s5_10, s5_11, s5_12,
+ s5_13, s5_14, s5_15, s5_18, s5_19, s5_20, s5_21, s5_26, s5_27, s5_28,
+ s5_29;
+ int16x8_t s6_0, s6_1, s6_2, s6_3, s6_4, s6_5, s6_6, s6_7, s6_10, s6_11, s6_12,
+ s6_13, s6_16, s6_17, s6_18, s6_19, s6_20, s6_21, s6_22, s6_23, s6_24,
+ s6_25, s6_26, s6_27, s6_28, s6_29, s6_30, s6_31;
+ int16x8_t s7_0, s7_1, s7_2, s7_3, s7_4, s7_5, s7_6, s7_7, s7_8, s7_9, s7_10,
+ s7_11, s7_12, s7_13, s7_14, s7_15, s7_20, s7_21, s7_22, s7_23, s7_24,
+ s7_25, s7_26, s7_27;
+
+ load_8x8_s16(input, &in0, &in1, &in2, &in3, &in4, &in5, &in6, &in7);
+ transpose_s16_8x8(&in0, &in1, &in2, &in3, &in4, &in5, &in6, &in7);
+
+ load_4x8_s16(input + 8, &tmp0, &tmp1, &tmp2, &tmp3, &tmp4, &tmp5, &tmp6,
+ &tmp7);
+ transpose_s16_4x8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, &in8, &in9,
+ &in10, &in11);
+
+ // stage 1
+ s1_16 = multiply_shift_and_narrow_s16(in1, cospi_31_64);
+ s1_31 = multiply_shift_and_narrow_s16(in1, cospi_1_64);
+
+ s1_18 = multiply_shift_and_narrow_s16(in9, cospi_23_64);
+ s1_29 = multiply_shift_and_narrow_s16(in9, cospi_9_64);
+
+ s1_19 = multiply_shift_and_narrow_s16(in7, -cospi_25_64);
+ s1_28 = multiply_shift_and_narrow_s16(in7, cospi_7_64);
+
+ s1_20 = multiply_shift_and_narrow_s16(in5, cospi_27_64);
+ s1_27 = multiply_shift_and_narrow_s16(in5, cospi_5_64);
+
+ s1_21 = multiply_shift_and_narrow_s16(in11, -cospi_21_64);
+ s1_26 = multiply_shift_and_narrow_s16(in11, cospi_11_64);
+
+ s1_23 = multiply_shift_and_narrow_s16(in3, -cospi_29_64);
+ s1_24 = multiply_shift_and_narrow_s16(in3, cospi_3_64);
+
+ // stage 2
+ s2_8 = multiply_shift_and_narrow_s16(in2, cospi_30_64);
+ s2_15 = multiply_shift_and_narrow_s16(in2, cospi_2_64);
+
+ s2_10 = multiply_shift_and_narrow_s16(in10, cospi_22_64);
+ s2_13 = multiply_shift_and_narrow_s16(in10, cospi_10_64);
+
+ s2_11 = multiply_shift_and_narrow_s16(in6, -cospi_26_64);
+ s2_12 = multiply_shift_and_narrow_s16(in6, cospi_6_64);
+
+ s2_18 = vsubq_s16(s1_19, s1_18);
+ s2_19 = vaddq_s16(s1_18, s1_19);
+ s2_20 = vaddq_s16(s1_20, s1_21);
+ s2_21 = vsubq_s16(s1_20, s1_21);
+ s2_26 = vsubq_s16(s1_27, s1_26);
+ s2_27 = vaddq_s16(s1_26, s1_27);
+ s2_28 = vaddq_s16(s1_28, s1_29);
+ s2_29 = vsubq_s16(s1_28, s1_29);
+
+ // stage 3
+ s3_4 = multiply_shift_and_narrow_s16(in4, cospi_28_64);
+ s3_7 = multiply_shift_and_narrow_s16(in4, cospi_4_64);
+
+ s3_10 = vsubq_s16(s2_11, s2_10);
+ s3_11 = vaddq_s16(s2_10, s2_11);
+ s3_12 = vaddq_s16(s2_12, s2_13);
+ s3_13 = vsubq_s16(s2_12, s2_13);
+
+ s3_17 = multiply_accumulate_shift_and_narrow_s16(s1_16, -cospi_4_64, s1_31,
+ cospi_28_64);
+ s3_30 = multiply_accumulate_shift_and_narrow_s16(s1_16, cospi_28_64, s1_31,
+ cospi_4_64);
+
+ s3_18 = multiply_accumulate_shift_and_narrow_s16(s2_18, -cospi_28_64, s2_29,
+ -cospi_4_64);
+ s3_29 = multiply_accumulate_shift_and_narrow_s16(s2_18, -cospi_4_64, s2_29,
+ cospi_28_64);
+
+ s3_21 = multiply_accumulate_shift_and_narrow_s16(s2_21, -cospi_20_64, s2_26,
+ cospi_12_64);
+ s3_26 = multiply_accumulate_shift_and_narrow_s16(s2_21, cospi_12_64, s2_26,
+ cospi_20_64);
+
+ s3_22 = multiply_accumulate_shift_and_narrow_s16(s1_23, -cospi_12_64, s1_24,
+ -cospi_20_64);
+ s3_25 = multiply_accumulate_shift_and_narrow_s16(s1_23, -cospi_20_64, s1_24,
+ cospi_12_64);
+
+ // stage 4
+ s4_0 = multiply_shift_and_narrow_s16(in0, cospi_16_64);
+ s4_2 = multiply_shift_and_narrow_s16(in8, cospi_24_64);
+ s4_3 = multiply_shift_and_narrow_s16(in8, cospi_8_64);
+
+ s4_9 = multiply_accumulate_shift_and_narrow_s16(s2_8, -cospi_8_64, s2_15,
+ cospi_24_64);
+ s4_14 = multiply_accumulate_shift_and_narrow_s16(s2_8, cospi_24_64, s2_15,
+ cospi_8_64);
+
+ s4_10 = multiply_accumulate_shift_and_narrow_s16(s3_10, -cospi_24_64, s3_13,
+ -cospi_8_64);
+ s4_13 = multiply_accumulate_shift_and_narrow_s16(s3_10, -cospi_8_64, s3_13,
+ cospi_24_64);
+
+ s4_16 = vaddq_s16(s1_16, s2_19);
+ s4_17 = vaddq_s16(s3_17, s3_18);
+ s4_18 = vsubq_s16(s3_17, s3_18);
+ s4_19 = vsubq_s16(s1_16, s2_19);
+ s4_20 = vsubq_s16(s1_23, s2_20);
+ s4_21 = vsubq_s16(s3_22, s3_21);
+ s4_22 = vaddq_s16(s3_21, s3_22);
+ s4_23 = vaddq_s16(s2_20, s1_23);
+ s4_24 = vaddq_s16(s1_24, s2_27);
+ s4_25 = vaddq_s16(s3_25, s3_26);
+ s4_26 = vsubq_s16(s3_25, s3_26);
+ s4_27 = vsubq_s16(s1_24, s2_27);
+ s4_28 = vsubq_s16(s1_31, s2_28);
+ s4_29 = vsubq_s16(s3_30, s3_29);
+ s4_30 = vaddq_s16(s3_29, s3_30);
+ s4_31 = vaddq_s16(s2_28, s1_31);
+
+ // stage 5
+ s5_0 = vaddq_s16(s4_0, s4_3);
+ s5_1 = vaddq_s16(s4_0, s4_2);
+ s5_2 = vsubq_s16(s4_0, s4_2);
+ s5_3 = vsubq_s16(s4_0, s4_3);
+
+ s5_5 = sub_multiply_shift_and_narrow_s16(s3_7, s3_4, cospi_16_64);
+ s5_6 = add_multiply_shift_and_narrow_s16(s3_4, s3_7, cospi_16_64);
+
+ s5_8 = vaddq_s16(s2_8, s3_11);
+ s5_9 = vaddq_s16(s4_9, s4_10);
+ s5_10 = vsubq_s16(s4_9, s4_10);
+ s5_11 = vsubq_s16(s2_8, s3_11);
+ s5_12 = vsubq_s16(s2_15, s3_12);
+ s5_13 = vsubq_s16(s4_14, s4_13);
+ s5_14 = vaddq_s16(s4_13, s4_14);
+ s5_15 = vaddq_s16(s2_15, s3_12);
+
+ s5_18 = multiply_accumulate_shift_and_narrow_s16(s4_18, -cospi_8_64, s4_29,
+ cospi_24_64);
+ s5_29 = multiply_accumulate_shift_and_narrow_s16(s4_18, cospi_24_64, s4_29,
+ cospi_8_64);
+
+ s5_19 = multiply_accumulate_shift_and_narrow_s16(s4_19, -cospi_8_64, s4_28,
+ cospi_24_64);
+ s5_28 = multiply_accumulate_shift_and_narrow_s16(s4_19, cospi_24_64, s4_28,
+ cospi_8_64);
+
+ s5_20 = multiply_accumulate_shift_and_narrow_s16(s4_20, -cospi_24_64, s4_27,
+ -cospi_8_64);
+ s5_27 = multiply_accumulate_shift_and_narrow_s16(s4_20, -cospi_8_64, s4_27,
+ cospi_24_64);
+
+ s5_21 = multiply_accumulate_shift_and_narrow_s16(s4_21, -cospi_24_64, s4_26,
+ -cospi_8_64);
+ s5_26 = multiply_accumulate_shift_and_narrow_s16(s4_21, -cospi_8_64, s4_26,
+ cospi_24_64);
+
+ // stage 6
+ s6_0 = vaddq_s16(s5_0, s3_7);
+ s6_1 = vaddq_s16(s5_1, s5_6);
+ s6_2 = vaddq_s16(s5_2, s5_5);
+ s6_3 = vaddq_s16(s5_3, s3_4);
+ s6_4 = vsubq_s16(s5_3, s3_4);
+ s6_5 = vsubq_s16(s5_2, s5_5);
+ s6_6 = vsubq_s16(s5_1, s5_6);
+ s6_7 = vsubq_s16(s5_0, s3_7);
+
+ s6_10 = sub_multiply_shift_and_narrow_s16(s5_13, s5_10, cospi_16_64);
+ s6_13 = add_multiply_shift_and_narrow_s16(s5_10, s5_13, cospi_16_64);
+
+ s6_11 = sub_multiply_shift_and_narrow_s16(s5_12, s5_11, cospi_16_64);
+ s6_12 = add_multiply_shift_and_narrow_s16(s5_11, s5_12, cospi_16_64);
+
+ s6_16 = vaddq_s16(s4_16, s4_23);
+ s6_17 = vaddq_s16(s4_17, s4_22);
+ s6_18 = vaddq_s16(s5_18, s5_21);
+ s6_19 = vaddq_s16(s5_19, s5_20);
+ s6_20 = vsubq_s16(s5_19, s5_20);
+ s6_21 = vsubq_s16(s5_18, s5_21);
+ s6_22 = vsubq_s16(s4_17, s4_22);
+ s6_23 = vsubq_s16(s4_16, s4_23);
+
+ s6_24 = vsubq_s16(s4_31, s4_24);
+ s6_25 = vsubq_s16(s4_30, s4_25);
+ s6_26 = vsubq_s16(s5_29, s5_26);
+ s6_27 = vsubq_s16(s5_28, s5_27);
+ s6_28 = vaddq_s16(s5_27, s5_28);
+ s6_29 = vaddq_s16(s5_26, s5_29);
+ s6_30 = vaddq_s16(s4_25, s4_30);
+ s6_31 = vaddq_s16(s4_24, s4_31);
+
+ // stage 7
+ s7_0 = vaddq_s16(s6_0, s5_15);
+ s7_1 = vaddq_s16(s6_1, s5_14);
+ s7_2 = vaddq_s16(s6_2, s6_13);
+ s7_3 = vaddq_s16(s6_3, s6_12);
+ s7_4 = vaddq_s16(s6_4, s6_11);
+ s7_5 = vaddq_s16(s6_5, s6_10);
+ s7_6 = vaddq_s16(s6_6, s5_9);
+ s7_7 = vaddq_s16(s6_7, s5_8);
+ s7_8 = vsubq_s16(s6_7, s5_8);
+ s7_9 = vsubq_s16(s6_6, s5_9);
+ s7_10 = vsubq_s16(s6_5, s6_10);
+ s7_11 = vsubq_s16(s6_4, s6_11);
+ s7_12 = vsubq_s16(s6_3, s6_12);
+ s7_13 = vsubq_s16(s6_2, s6_13);
+ s7_14 = vsubq_s16(s6_1, s5_14);
+ s7_15 = vsubq_s16(s6_0, s5_15);
+
+ s7_20 = sub_multiply_shift_and_narrow_s16(s6_27, s6_20, cospi_16_64);
+ s7_27 = add_multiply_shift_and_narrow_s16(s6_20, s6_27, cospi_16_64);
+
+ s7_21 = sub_multiply_shift_and_narrow_s16(s6_26, s6_21, cospi_16_64);
+ s7_26 = add_multiply_shift_and_narrow_s16(s6_21, s6_26, cospi_16_64);
+
+ s7_22 = sub_multiply_shift_and_narrow_s16(s6_25, s6_22, cospi_16_64);
+ s7_25 = add_multiply_shift_and_narrow_s16(s6_22, s6_25, cospi_16_64);
+
+ s7_23 = sub_multiply_shift_and_narrow_s16(s6_24, s6_23, cospi_16_64);
+ s7_24 = add_multiply_shift_and_narrow_s16(s6_23, s6_24, cospi_16_64);
+
+ // final stage
+ vst1q_s16(output, vaddq_s16(s7_0, s6_31));
+ output += 16;
+ vst1q_s16(output, vaddq_s16(s7_1, s6_30));
+ output += 16;
+ vst1q_s16(output, vaddq_s16(s7_2, s6_29));
+ output += 16;
+ vst1q_s16(output, vaddq_s16(s7_3, s6_28));
+ output += 16;
+ vst1q_s16(output, vaddq_s16(s7_4, s7_27));
+ output += 16;
+ vst1q_s16(output, vaddq_s16(s7_5, s7_26));
+ output += 16;
+ vst1q_s16(output, vaddq_s16(s7_6, s7_25));
+ output += 16;
+ vst1q_s16(output, vaddq_s16(s7_7, s7_24));
+ output += 16;
+
+ vst1q_s16(output, vaddq_s16(s7_8, s7_23));
+ output += 16;
+ vst1q_s16(output, vaddq_s16(s7_9, s7_22));
+ output += 16;
+ vst1q_s16(output, vaddq_s16(s7_10, s7_21));
+ output += 16;
+ vst1q_s16(output, vaddq_s16(s7_11, s7_20));
+ output += 16;
+ vst1q_s16(output, vaddq_s16(s7_12, s6_19));
+ output += 16;
+ vst1q_s16(output, vaddq_s16(s7_13, s6_18));
+ output += 16;
+ vst1q_s16(output, vaddq_s16(s7_14, s6_17));
+ output += 16;
+ vst1q_s16(output, vaddq_s16(s7_15, s6_16));
+ output += 16;
+
+ vst1q_s16(output, vsubq_s16(s7_15, s6_16));
+ output += 16;
+ vst1q_s16(output, vsubq_s16(s7_14, s6_17));
+ output += 16;
+ vst1q_s16(output, vsubq_s16(s7_13, s6_18));
+ output += 16;
+ vst1q_s16(output, vsubq_s16(s7_12, s6_19));
+ output += 16;
+ vst1q_s16(output, vsubq_s16(s7_11, s7_20));
+ output += 16;
+ vst1q_s16(output, vsubq_s16(s7_10, s7_21));
+ output += 16;
+ vst1q_s16(output, vsubq_s16(s7_9, s7_22));
+ output += 16;
+ vst1q_s16(output, vsubq_s16(s7_8, s7_23));
+ output += 16;
+
+ vst1q_s16(output, vsubq_s16(s7_7, s7_24));
+ output += 16;
+ vst1q_s16(output, vsubq_s16(s7_6, s7_25));
+ output += 16;
+ vst1q_s16(output, vsubq_s16(s7_5, s7_26));
+ output += 16;
+ vst1q_s16(output, vsubq_s16(s7_4, s7_27));
+ output += 16;
+ vst1q_s16(output, vsubq_s16(s7_3, s6_28));
+ output += 16;
+ vst1q_s16(output, vsubq_s16(s7_2, s6_29));
+ output += 16;
+ vst1q_s16(output, vsubq_s16(s7_1, s6_30));
+ output += 16;
+ vst1q_s16(output, vsubq_s16(s7_0, s6_31));
+}
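
The "final stage" stores above are the last butterfly of the 32-point transform: output row i pairs with row 31 - i, the sum going to the top half and the difference, in reverse order, to the bottom half (the 31 - i operands are the s6_16..s6_31 and s7_20..s7_27 values). A scalar rendering of that pairing, with names local to the sketch:

#include <stdint.h>

/* Scalar view of the final idct32 butterfly: out[i] = step[i] + step[31 - i],
 * out[31 - i] = step[i] - step[31 - i], matching the store order above. */
static void idct32_final_stage_model(const int16_t step[32], int16_t out[32]) {
  int i;
  for (i = 0; i < 16; i++) {
    out[i] = (int16_t)(step[i] + step[31 - i]);
    out[31 - i] = (int16_t)(step[i] - step[31 - i]);
  }
}
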
+
+static void idct32_16_neon(const int16_t *input, uint8_t *output, int stride) {
+ int16x8_t in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12,
+ in13, in14, in15;
+ int16x8_t s1_16, s1_17, s1_18, s1_19, s1_20, s1_21, s1_22, s1_23, s1_24,
+ s1_25, s1_26, s1_27, s1_28, s1_29, s1_30, s1_31;
+ int16x8_t s2_8, s2_9, s2_10, s2_11, s2_12, s2_13, s2_14, s2_15, s2_16, s2_17,
+ s2_18, s2_19, s2_20, s2_21, s2_22, s2_23, s2_24, s2_25, s2_26, s2_27,
+ s2_28, s2_29, s2_30, s2_31;
+ int16x8_t s3_4, s3_5, s3_6, s3_7, s3_8, s3_9, s3_10, s3_11, s3_12, s3_13,
+ s3_14, s3_15, s3_17, s3_18, s3_21, s3_22, s3_25, s3_26, s3_29, s3_30;
+ int16x8_t s4_0, s4_2, s4_3, s4_4, s4_5, s4_6, s4_7, s4_9, s4_10, s4_13, s4_14,
+ s4_16, s4_17, s4_18, s4_19, s4_20, s4_21, s4_22, s4_23, s4_24, s4_25,
+ s4_26, s4_27, s4_28, s4_29, s4_30, s4_31;
+ int16x8_t s5_0, s5_1, s5_2, s5_3, s5_5, s5_6, s5_8, s5_9, s5_10, s5_11, s5_12,
+ s5_13, s5_14, s5_15, s5_18, s5_19, s5_20, s5_21, s5_26, s5_27, s5_28,
+ s5_29;
+ int16x8_t s6_0, s6_1, s6_2, s6_3, s6_4, s6_5, s6_6, s6_7, s6_10, s6_11, s6_12,
+ s6_13, s6_16, s6_17, s6_18, s6_19, s6_20, s6_21, s6_22, s6_23, s6_24,
+ s6_25, s6_26, s6_27, s6_28, s6_29, s6_30, s6_31;
+ int16x8_t s7_0, s7_1, s7_2, s7_3, s7_4, s7_5, s7_6, s7_7, s7_8, s7_9, s7_10,
+ s7_11, s7_12, s7_13, s7_14, s7_15, s7_20, s7_21, s7_22, s7_23, s7_24,
+ s7_25, s7_26, s7_27;
+ int16x8_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+ load_and_transpose_s16_8x8(input, 16, &in0, &in1, &in2, &in3, &in4, &in5,
+ &in6, &in7);
+
+ load_and_transpose_s16_8x8(input + 8, 16, &in8, &in9, &in10, &in11, &in12,
+ &in13, &in14, &in15);
+
+ // stage 1
+ s1_16 = multiply_shift_and_narrow_s16(in1, cospi_31_64);
+ s1_31 = multiply_shift_and_narrow_s16(in1, cospi_1_64);
+
+ s1_17 = multiply_shift_and_narrow_s16(in15, -cospi_17_64);
+ s1_30 = multiply_shift_and_narrow_s16(in15, cospi_15_64);
+
+ s1_18 = multiply_shift_and_narrow_s16(in9, cospi_23_64);
+ s1_29 = multiply_shift_and_narrow_s16(in9, cospi_9_64);
+
+ s1_19 = multiply_shift_and_narrow_s16(in7, -cospi_25_64);
+ s1_28 = multiply_shift_and_narrow_s16(in7, cospi_7_64);
+
+ s1_20 = multiply_shift_and_narrow_s16(in5, cospi_27_64);
+ s1_27 = multiply_shift_and_narrow_s16(in5, cospi_5_64);
+
+ s1_21 = multiply_shift_and_narrow_s16(in11, -cospi_21_64);
+ s1_26 = multiply_shift_and_narrow_s16(in11, cospi_11_64);
+
+ s1_22 = multiply_shift_and_narrow_s16(in13, cospi_19_64);
+ s1_25 = multiply_shift_and_narrow_s16(in13, cospi_13_64);
+
+ s1_23 = multiply_shift_and_narrow_s16(in3, -cospi_29_64);
+ s1_24 = multiply_shift_and_narrow_s16(in3, cospi_3_64);
+
+ // stage 2
+ s2_8 = multiply_shift_and_narrow_s16(in2, cospi_30_64);
+ s2_15 = multiply_shift_and_narrow_s16(in2, cospi_2_64);
+
+ s2_9 = multiply_shift_and_narrow_s16(in14, -cospi_18_64);
+ s2_14 = multiply_shift_and_narrow_s16(in14, cospi_14_64);
+
+ s2_10 = multiply_shift_and_narrow_s16(in10, cospi_22_64);
+ s2_13 = multiply_shift_and_narrow_s16(in10, cospi_10_64);
+
+ s2_11 = multiply_shift_and_narrow_s16(in6, -cospi_26_64);
+ s2_12 = multiply_shift_and_narrow_s16(in6, cospi_6_64);
+
+ s2_16 = vaddq_s16(s1_16, s1_17);
+ s2_17 = vsubq_s16(s1_16, s1_17);
+ s2_18 = vsubq_s16(s1_19, s1_18);
+ s2_19 = vaddq_s16(s1_18, s1_19);
+ s2_20 = vaddq_s16(s1_20, s1_21);
+ s2_21 = vsubq_s16(s1_20, s1_21);
+ s2_22 = vsubq_s16(s1_23, s1_22);
+ s2_23 = vaddq_s16(s1_22, s1_23);
+ s2_24 = vaddq_s16(s1_24, s1_25);
+ s2_25 = vsubq_s16(s1_24, s1_25);
+ s2_26 = vsubq_s16(s1_27, s1_26);
+ s2_27 = vaddq_s16(s1_26, s1_27);
+ s2_28 = vaddq_s16(s1_28, s1_29);
+ s2_29 = vsubq_s16(s1_28, s1_29);
+ s2_30 = vsubq_s16(s1_31, s1_30);
+ s2_31 = vaddq_s16(s1_30, s1_31);
+
+ // stage 3
+ s3_4 = multiply_shift_and_narrow_s16(in4, cospi_28_64);
+ s3_7 = multiply_shift_and_narrow_s16(in4, cospi_4_64);
+
+ s3_5 = multiply_shift_and_narrow_s16(in12, -cospi_20_64);
+ s3_6 = multiply_shift_and_narrow_s16(in12, cospi_12_64);
+
+ s3_8 = vaddq_s16(s2_8, s2_9);
+ s3_9 = vsubq_s16(s2_8, s2_9);
+ s3_10 = vsubq_s16(s2_11, s2_10);
+ s3_11 = vaddq_s16(s2_10, s2_11);
+ s3_12 = vaddq_s16(s2_12, s2_13);
+ s3_13 = vsubq_s16(s2_12, s2_13);
+ s3_14 = vsubq_s16(s2_15, s2_14);
+ s3_15 = vaddq_s16(s2_14, s2_15);
+
+ s3_17 = multiply_accumulate_shift_and_narrow_s16(s2_17, -cospi_4_64, s2_30,
+ cospi_28_64);
+ s3_30 = multiply_accumulate_shift_and_narrow_s16(s2_17, cospi_28_64, s2_30,
+ cospi_4_64);
+
+ s3_18 = multiply_accumulate_shift_and_narrow_s16(s2_18, -cospi_28_64, s2_29,
+ -cospi_4_64);
+ s3_29 = multiply_accumulate_shift_and_narrow_s16(s2_18, -cospi_4_64, s2_29,
+ cospi_28_64);
+
+ s3_21 = multiply_accumulate_shift_and_narrow_s16(s2_21, -cospi_20_64, s2_26,
+ cospi_12_64);
+ s3_26 = multiply_accumulate_shift_and_narrow_s16(s2_21, cospi_12_64, s2_26,
+ cospi_20_64);
+
+ s3_22 = multiply_accumulate_shift_and_narrow_s16(s2_22, -cospi_12_64, s2_25,
+ -cospi_20_64);
+ s3_25 = multiply_accumulate_shift_and_narrow_s16(s2_22, -cospi_20_64, s2_25,
+ cospi_12_64);
+
+ // stage 4
+ s4_0 = multiply_shift_and_narrow_s16(in0, cospi_16_64);
+ s4_2 = multiply_shift_and_narrow_s16(in8, cospi_24_64);
+ s4_3 = multiply_shift_and_narrow_s16(in8, cospi_8_64);
+
+ s4_4 = vaddq_s16(s3_4, s3_5);
+ s4_5 = vsubq_s16(s3_4, s3_5);
+ s4_6 = vsubq_s16(s3_7, s3_6);
+ s4_7 = vaddq_s16(s3_6, s3_7);
+
+ s4_9 = multiply_accumulate_shift_and_narrow_s16(s3_9, -cospi_8_64, s3_14,
+ cospi_24_64);
+ s4_14 = multiply_accumulate_shift_and_narrow_s16(s3_9, cospi_24_64, s3_14,
+ cospi_8_64);
+
+ s4_10 = multiply_accumulate_shift_and_narrow_s16(s3_10, -cospi_24_64, s3_13,
+ -cospi_8_64);
+ s4_13 = multiply_accumulate_shift_and_narrow_s16(s3_10, -cospi_8_64, s3_13,
+ cospi_24_64);
+
+ s4_16 = vaddq_s16(s2_16, s2_19);
+ s4_17 = vaddq_s16(s3_17, s3_18);
+ s4_18 = vsubq_s16(s3_17, s3_18);
+ s4_19 = vsubq_s16(s2_16, s2_19);
+ s4_20 = vsubq_s16(s2_23, s2_20);
+ s4_21 = vsubq_s16(s3_22, s3_21);
+ s4_22 = vaddq_s16(s3_21, s3_22);
+ s4_23 = vaddq_s16(s2_20, s2_23);
+ s4_24 = vaddq_s16(s2_24, s2_27);
+ s4_25 = vaddq_s16(s3_25, s3_26);
+ s4_26 = vsubq_s16(s3_25, s3_26);
+ s4_27 = vsubq_s16(s2_24, s2_27);
+ s4_28 = vsubq_s16(s2_31, s2_28);
+ s4_29 = vsubq_s16(s3_30, s3_29);
+ s4_30 = vaddq_s16(s3_29, s3_30);
+ s4_31 = vaddq_s16(s2_28, s2_31);
+
+ // stage 5
+ s5_0 = vaddq_s16(s4_0, s4_3);
+ s5_1 = vaddq_s16(s4_0, s4_2);
+ s5_2 = vsubq_s16(s4_0, s4_2);
+ s5_3 = vsubq_s16(s4_0, s4_3);
+
+ s5_5 = sub_multiply_shift_and_narrow_s16(s4_6, s4_5, cospi_16_64);
+ s5_6 = add_multiply_shift_and_narrow_s16(s4_5, s4_6, cospi_16_64);
+
+ s5_8 = vaddq_s16(s3_8, s3_11);
+ s5_9 = vaddq_s16(s4_9, s4_10);
+ s5_10 = vsubq_s16(s4_9, s4_10);
+ s5_11 = vsubq_s16(s3_8, s3_11);
+ s5_12 = vsubq_s16(s3_15, s3_12);
+ s5_13 = vsubq_s16(s4_14, s4_13);
+ s5_14 = vaddq_s16(s4_13, s4_14);
+ s5_15 = vaddq_s16(s3_15, s3_12);
+
+ s5_18 = multiply_accumulate_shift_and_narrow_s16(s4_18, -cospi_8_64, s4_29,
+ cospi_24_64);
+ s5_29 = multiply_accumulate_shift_and_narrow_s16(s4_18, cospi_24_64, s4_29,
+ cospi_8_64);
+
+ s5_19 = multiply_accumulate_shift_and_narrow_s16(s4_19, -cospi_8_64, s4_28,
+ cospi_24_64);
+ s5_28 = multiply_accumulate_shift_and_narrow_s16(s4_19, cospi_24_64, s4_28,
+ cospi_8_64);
+
+ s5_20 = multiply_accumulate_shift_and_narrow_s16(s4_20, -cospi_24_64, s4_27,
+ -cospi_8_64);
+ s5_27 = multiply_accumulate_shift_and_narrow_s16(s4_20, -cospi_8_64, s4_27,
+ cospi_24_64);
+
+ s5_21 = multiply_accumulate_shift_and_narrow_s16(s4_21, -cospi_24_64, s4_26,
+ -cospi_8_64);
+ s5_26 = multiply_accumulate_shift_and_narrow_s16(s4_21, -cospi_8_64, s4_26,
+ cospi_24_64);
+
+ // stage 6
+ s6_0 = vaddq_s16(s5_0, s4_7);
+ s6_1 = vaddq_s16(s5_1, s5_6);
+ s6_2 = vaddq_s16(s5_2, s5_5);
+ s6_3 = vaddq_s16(s5_3, s4_4);
+ s6_4 = vsubq_s16(s5_3, s4_4);
+ s6_5 = vsubq_s16(s5_2, s5_5);
+ s6_6 = vsubq_s16(s5_1, s5_6);
+ s6_7 = vsubq_s16(s5_0, s4_7);
+
+ s6_10 = sub_multiply_shift_and_narrow_s16(s5_13, s5_10, cospi_16_64);
+ s6_13 = add_multiply_shift_and_narrow_s16(s5_10, s5_13, cospi_16_64);
+
+ s6_11 = sub_multiply_shift_and_narrow_s16(s5_12, s5_11, cospi_16_64);
+ s6_12 = add_multiply_shift_and_narrow_s16(s5_11, s5_12, cospi_16_64);
+
+ s6_16 = vaddq_s16(s4_16, s4_23);
+ s6_17 = vaddq_s16(s4_17, s4_22);
+ s6_18 = vaddq_s16(s5_18, s5_21);
+ s6_19 = vaddq_s16(s5_19, s5_20);
+ s6_20 = vsubq_s16(s5_19, s5_20);
+ s6_21 = vsubq_s16(s5_18, s5_21);
+ s6_22 = vsubq_s16(s4_17, s4_22);
+ s6_23 = vsubq_s16(s4_16, s4_23);
+ s6_24 = vsubq_s16(s4_31, s4_24);
+ s6_25 = vsubq_s16(s4_30, s4_25);
+ s6_26 = vsubq_s16(s5_29, s5_26);
+ s6_27 = vsubq_s16(s5_28, s5_27);
+ s6_28 = vaddq_s16(s5_27, s5_28);
+ s6_29 = vaddq_s16(s5_26, s5_29);
+ s6_30 = vaddq_s16(s4_25, s4_30);
+ s6_31 = vaddq_s16(s4_24, s4_31);
+
+ // stage 7
+ s7_0 = vaddq_s16(s6_0, s5_15);
+ s7_1 = vaddq_s16(s6_1, s5_14);
+ s7_2 = vaddq_s16(s6_2, s6_13);
+ s7_3 = vaddq_s16(s6_3, s6_12);
+ s7_4 = vaddq_s16(s6_4, s6_11);
+ s7_5 = vaddq_s16(s6_5, s6_10);
+ s7_6 = vaddq_s16(s6_6, s5_9);
+ s7_7 = vaddq_s16(s6_7, s5_8);
+ s7_8 = vsubq_s16(s6_7, s5_8);
+ s7_9 = vsubq_s16(s6_6, s5_9);
+ s7_10 = vsubq_s16(s6_5, s6_10);
+ s7_11 = vsubq_s16(s6_4, s6_11);
+ s7_12 = vsubq_s16(s6_3, s6_12);
+ s7_13 = vsubq_s16(s6_2, s6_13);
+ s7_14 = vsubq_s16(s6_1, s5_14);
+ s7_15 = vsubq_s16(s6_0, s5_15);
+
+ s7_20 = sub_multiply_shift_and_narrow_s16(s6_27, s6_20, cospi_16_64);
+ s7_27 = add_multiply_shift_and_narrow_s16(s6_20, s6_27, cospi_16_64);
+
+ s7_21 = sub_multiply_shift_and_narrow_s16(s6_26, s6_21, cospi_16_64);
+ s7_26 = add_multiply_shift_and_narrow_s16(s6_21, s6_26, cospi_16_64);
+
+ s7_22 = sub_multiply_shift_and_narrow_s16(s6_25, s6_22, cospi_16_64);
+ s7_25 = add_multiply_shift_and_narrow_s16(s6_22, s6_25, cospi_16_64);
+
+ s7_23 = sub_multiply_shift_and_narrow_s16(s6_24, s6_23, cospi_16_64);
+ s7_24 = add_multiply_shift_and_narrow_s16(s6_23, s6_24, cospi_16_64);
+
+ // final stage
+ out0 = vaddq_s16(s7_0, s6_31);
+ out1 = vaddq_s16(s7_1, s6_30);
+ out2 = vaddq_s16(s7_2, s6_29);
+ out3 = vaddq_s16(s7_3, s6_28);
+ out4 = vaddq_s16(s7_4, s7_27);
+ out5 = vaddq_s16(s7_5, s7_26);
+ out6 = vaddq_s16(s7_6, s7_25);
+ out7 = vaddq_s16(s7_7, s7_24);
+
+ add_and_store_u8_s16(out0, out1, out2, out3, out4, out5, out6, out7, output,
+ stride);
+
+ out0 = vaddq_s16(s7_8, s7_23);
+ out1 = vaddq_s16(s7_9, s7_22);
+ out2 = vaddq_s16(s7_10, s7_21);
+ out3 = vaddq_s16(s7_11, s7_20);
+ out4 = vaddq_s16(s7_12, s6_19);
+ out5 = vaddq_s16(s7_13, s6_18);
+ out6 = vaddq_s16(s7_14, s6_17);
+ out7 = vaddq_s16(s7_15, s6_16);
+
+ add_and_store_u8_s16(out0, out1, out2, out3, out4, out5, out6, out7,
+ output + (8 * stride), stride);
+
+ out0 = vsubq_s16(s7_15, s6_16);
+ out1 = vsubq_s16(s7_14, s6_17);
+ out2 = vsubq_s16(s7_13, s6_18);
+ out3 = vsubq_s16(s7_12, s6_19);
+ out4 = vsubq_s16(s7_11, s7_20);
+ out5 = vsubq_s16(s7_10, s7_21);
+ out6 = vsubq_s16(s7_9, s7_22);
+ out7 = vsubq_s16(s7_8, s7_23);
+
+ add_and_store_u8_s16(out0, out1, out2, out3, out4, out5, out6, out7,
+ output + (16 * stride), stride);
+
+ out0 = vsubq_s16(s7_7, s7_24);
+ out1 = vsubq_s16(s7_6, s7_25);
+ out2 = vsubq_s16(s7_5, s7_26);
+ out3 = vsubq_s16(s7_4, s7_27);
+ out4 = vsubq_s16(s7_3, s6_28);
+ out5 = vsubq_s16(s7_2, s6_29);
+ out6 = vsubq_s16(s7_1, s6_30);
+ out7 = vsubq_s16(s7_0, s6_31);
+
+ add_and_store_u8_s16(out0, out1, out2, out3, out4, out5, out6, out7,
+ output + (24 * stride), stride);
+}
+
+void vpx_idct32x32_135_add_neon(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ int i;
+ int16_t temp[32 * 16];
+ int16_t *t = temp;
+
+ idct32_12_neon(input, temp);
+ idct32_12_neon(input + 32 * 8, temp + 8);
+
+ for (i = 0; i < 32; i += 8) {
+ idct32_16_neon(t, dest, stride);
+ t += (16 * 8);
+ dest += 8;
+ }
+}
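
The wrapper runs idct32_12_neon() twice to cover rows 0-7 and 8-15 of the top-left 16x16, then sweeps idct32_16_neon() across the output eight columns at a time. The _135_ name refers to the end-of-block threshold this kernel is written for; a hedged sketch of the eob-based dispatch a caller might use follows, where the thresholds match the _34_/_135_ specialisations but the wrapper itself is illustrative rather than copied from vp9:

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/vpx_dsp_common.h"

/* Illustrative eob-based dispatch for the 32x32 inverse transform. */
static void idct32x32_add_dispatch(const tran_low_t *input, uint8_t *dest,
                                   int stride, int eob) {
  if (eob == 1)
    vpx_idct32x32_1_add(input, dest, stride);   /* DC only */
  else if (eob <= 34)
    vpx_idct32x32_34_add(input, dest, stride);  /* top-left 8x8 only */
  else if (eob <= 135)
    vpx_idct32x32_135_add(input, dest, stride); /* top-left 16x16 only */
  else
    vpx_idct32x32_1024_add(input, dest, stride);
}
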
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c
index 6be4b01229b..604d82abd18 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c
@@ -10,127 +10,48 @@
#include <arm_neon.h>
-#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/idct_neon.h"
#include "vpx_dsp/inv_txfm.h"
-#include "vpx_ports/mem.h"
-static INLINE void LD_16x8(uint8_t *d, int d_stride, uint8x16_t *q8u8,
- uint8x16_t *q9u8, uint8x16_t *q10u8,
- uint8x16_t *q11u8, uint8x16_t *q12u8,
- uint8x16_t *q13u8, uint8x16_t *q14u8,
- uint8x16_t *q15u8) {
- *q8u8 = vld1q_u8(d);
- d += d_stride;
- *q9u8 = vld1q_u8(d);
- d += d_stride;
- *q10u8 = vld1q_u8(d);
- d += d_stride;
- *q11u8 = vld1q_u8(d);
- d += d_stride;
- *q12u8 = vld1q_u8(d);
- d += d_stride;
- *q13u8 = vld1q_u8(d);
- d += d_stride;
- *q14u8 = vld1q_u8(d);
- d += d_stride;
- *q15u8 = vld1q_u8(d);
+static INLINE void idct32x32_1_add_pos_kernel(uint8_t **dest, const int stride,
+ const uint8x16_t res) {
+ const uint8x16_t a0 = vld1q_u8(*dest);
+ const uint8x16_t a1 = vld1q_u8(*dest + 16);
+ const uint8x16_t b0 = vqaddq_u8(a0, res);
+ const uint8x16_t b1 = vqaddq_u8(a1, res);
+ vst1q_u8(*dest, b0);
+ vst1q_u8(*dest + 16, b1);
+ *dest += stride;
}
-static INLINE void ADD_DIFF_16x8(uint8x16_t qdiffu8, uint8x16_t *q8u8,
- uint8x16_t *q9u8, uint8x16_t *q10u8,
- uint8x16_t *q11u8, uint8x16_t *q12u8,
- uint8x16_t *q13u8, uint8x16_t *q14u8,
- uint8x16_t *q15u8) {
- *q8u8 = vqaddq_u8(*q8u8, qdiffu8);
- *q9u8 = vqaddq_u8(*q9u8, qdiffu8);
- *q10u8 = vqaddq_u8(*q10u8, qdiffu8);
- *q11u8 = vqaddq_u8(*q11u8, qdiffu8);
- *q12u8 = vqaddq_u8(*q12u8, qdiffu8);
- *q13u8 = vqaddq_u8(*q13u8, qdiffu8);
- *q14u8 = vqaddq_u8(*q14u8, qdiffu8);
- *q15u8 = vqaddq_u8(*q15u8, qdiffu8);
-}
-
-static INLINE void SUB_DIFF_16x8(uint8x16_t qdiffu8, uint8x16_t *q8u8,
- uint8x16_t *q9u8, uint8x16_t *q10u8,
- uint8x16_t *q11u8, uint8x16_t *q12u8,
- uint8x16_t *q13u8, uint8x16_t *q14u8,
- uint8x16_t *q15u8) {
- *q8u8 = vqsubq_u8(*q8u8, qdiffu8);
- *q9u8 = vqsubq_u8(*q9u8, qdiffu8);
- *q10u8 = vqsubq_u8(*q10u8, qdiffu8);
- *q11u8 = vqsubq_u8(*q11u8, qdiffu8);
- *q12u8 = vqsubq_u8(*q12u8, qdiffu8);
- *q13u8 = vqsubq_u8(*q13u8, qdiffu8);
- *q14u8 = vqsubq_u8(*q14u8, qdiffu8);
- *q15u8 = vqsubq_u8(*q15u8, qdiffu8);
-}
-
-static INLINE void ST_16x8(uint8_t *d, int d_stride, uint8x16_t *q8u8,
- uint8x16_t *q9u8, uint8x16_t *q10u8,
- uint8x16_t *q11u8, uint8x16_t *q12u8,
- uint8x16_t *q13u8, uint8x16_t *q14u8,
- uint8x16_t *q15u8) {
- vst1q_u8(d, *q8u8);
- d += d_stride;
- vst1q_u8(d, *q9u8);
- d += d_stride;
- vst1q_u8(d, *q10u8);
- d += d_stride;
- vst1q_u8(d, *q11u8);
- d += d_stride;
- vst1q_u8(d, *q12u8);
- d += d_stride;
- vst1q_u8(d, *q13u8);
- d += d_stride;
- vst1q_u8(d, *q14u8);
- d += d_stride;
- vst1q_u8(d, *q15u8);
+static INLINE void idct32x32_1_add_neg_kernel(uint8_t **dest, const int stride,
+ const uint8x16_t res) {
+ const uint8x16_t a0 = vld1q_u8(*dest);
+ const uint8x16_t a1 = vld1q_u8(*dest + 16);
+ const uint8x16_t b0 = vqsubq_u8(a0, res);
+ const uint8x16_t b1 = vqsubq_u8(a1, res);
+ vst1q_u8(*dest, b0);
+ vst1q_u8(*dest + 16, b1);
+ *dest += stride;
}
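
Two kernels are needed because the adds use unsigned saturating NEON ops, so a negative DC offset has to be applied by saturating-subtracting its magnitude. A scalar sketch of the combined effect on one 32-pixel row, with the helper name local to the sketch:

#include <stdint.h>

/* Scalar equivalent of one pos/neg kernel call: add the DC offset a1 (which
 * may be negative) to 32 destination pixels and clamp to [0, 255]. */
static void idct32x32_1_add_row_model(uint8_t *dest, int a1) {
  int i;
  for (i = 0; i < 32; i++) {
    const int v = dest[i] + a1;
    dest[i] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
  }
}
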
void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest,
- int dest_stride) {
- uint8x16_t q0u8, q8u8, q9u8, q10u8, q11u8, q12u8, q13u8, q14u8, q15u8;
- int i, j, dest_stride8;
- uint8_t *d;
- int16_t a1;
- int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
-
- out = dct_const_round_shift(out * cospi_16_64);
- a1 = ROUND_POWER_OF_TWO(out, 6);
-
- dest_stride8 = dest_stride * 8;
- if (a1 >= 0) { // diff_positive_32_32
- a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1;
- q0u8 = vdupq_n_u8((uint8_t)a1);
- for (i = 0; i < 2; i++, dest += 16) { // diff_positive_32_32_loop
- d = dest;
- for (j = 0; j < 4; j++) {
- LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8,
- &q14u8, &q15u8);
- ADD_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8,
- &q14u8, &q15u8);
- ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8,
- &q14u8, &q15u8);
- d += dest_stride8;
- }
+ int stride) {
+ int i;
+ const int16_t out0 = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+ const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64));
+ const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6);
+
+ if (a1 >= 0) {
+ const uint8x16_t dc = create_dcq(a1);
+ for (i = 0; i < 32; i++) {
+ idct32x32_1_add_pos_kernel(&dest, stride, dc);
}
- } else { // diff_negative_32_32
- a1 = -a1;
- a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1;
- q0u8 = vdupq_n_u8((uint8_t)a1);
- for (i = 0; i < 2; i++, dest += 16) { // diff_negative_32_32_loop
- d = dest;
- for (j = 0; j < 4; j++) {
- LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8,
- &q14u8, &q15u8);
- SUB_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8,
- &q14u8, &q15u8);
- ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8,
- &q14u8, &q15u8);
- d += dest_stride8;
- }
+ } else {
+ const uint8x16_t dc = create_dcq(-a1);
+ for (i = 0; i < 32; i++) {
+ idct32x32_1_add_neg_kernel(&dest, stride, dc);
}
}
}
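
The DC value a1 applied above comes from two rounds of the Q14 cospi_16_64 multiply followed by the final 1/64 rounding. A scalar walk-through; the constant 11585 is assumed to be the cospi_16_64 value from vpx_dsp/txfm_common.h, and round_shift14() models dct_const_round_shift():

#include <stdint.h>
#include <stdio.h>

/* Rounding shift by DCT_CONST_BITS (14), as in dct_const_round_shift(). */
static int16_t round_shift14(int32_t x) {
  return (int16_t)((x + (1 << 13)) >> 14);
}

int main(void) {
  const int32_t cospi_16_64 = 11585;               /* assumed Q14 constant */
  const int16_t dc = 100;                          /* example input[0] */
  const int16_t out0 = round_shift14(dc * cospi_16_64);
  const int16_t out1 = round_shift14(out0 * cospi_16_64);
  const int16_t a1 = (int16_t)((out1 + 32) >> 6);  /* ROUND_POWER_OF_TWO(out1, 6) */
  printf("dc=%d -> a1=%d\n", dc, a1);              /* value added to every pixel */
  return 0;
}
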
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c
index ebec9df54ad..b56deeea6de 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c
@@ -13,6 +13,7 @@
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_dsp/txfm_common.h"
// Only for the first pass of the _34_ variant. Since it only uses values from
@@ -34,7 +35,7 @@
// 5 13 20 26
// 6 21 27 33
// 7 24 32
-static void idct32_6_neon(const int16_t *input, int16_t *output) {
+static void idct32_6_neon(const tran_low_t *input, int16_t *output) {
int16x8_t in0, in1, in2, in3, in4, in5, in6, in7;
int16x8_t s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8, s1_9, s1_10,
s1_11, s1_12, s1_13, s1_14, s1_15, s1_16, s1_17, s1_18, s1_19, s1_20,
@@ -46,8 +47,22 @@ static void idct32_6_neon(const int16_t *input, int16_t *output) {
s2_31;
int16x8_t s3_24, s3_25, s3_26, s3_27;
- load_and_transpose_s16_8x8(input, 32, &in0, &in1, &in2, &in3, &in4, &in5,
- &in6, &in7);
+ in0 = load_tran_low_to_s16q(input);
+ input += 32;
+ in1 = load_tran_low_to_s16q(input);
+ input += 32;
+ in2 = load_tran_low_to_s16q(input);
+ input += 32;
+ in3 = load_tran_low_to_s16q(input);
+ input += 32;
+ in4 = load_tran_low_to_s16q(input);
+ input += 32;
+ in5 = load_tran_low_to_s16q(input);
+ input += 32;
+ in6 = load_tran_low_to_s16q(input);
+ input += 32;
+ in7 = load_tran_low_to_s16q(input);
+ transpose_s16_8x8(&in0, &in1, &in2, &in3, &in4, &in5, &in6, &in7);
// stage 1
// input[1] * cospi_31_64 - input[31] * cospi_1_64 (but input[31] == 0)
@@ -503,7 +518,7 @@ static void idct32_8_neon(const int16_t *input, uint8_t *output, int stride) {
output + (24 * stride), stride);
}
-void vpx_idct32x32_34_add_neon(const int16_t *input, uint8_t *dest,
+void vpx_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest,
int stride) {
int i;
int16_t temp[32 * 8];
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_add_neon.c
index 4eff9b970d9..de1bf978750 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_add_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_add_neon.c
@@ -12,6 +12,7 @@
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/idct_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_dsp/txfm_common.h"
@@ -146,55 +147,101 @@ static INLINE void DO_BUTTERFLY(int16x8_t q14s16, int16x8_t q13s16,
q11s32 = vaddq_s32(q12s32, q11s32);
q10s32 = vaddq_s32(q10s32, q15s32);
- *qAs16 = vcombine_s16(vqrshrn_n_s32(q8s32, 14), vqrshrn_n_s32(q9s32, 14));
- *qBs16 = vcombine_s16(vqrshrn_n_s32(q11s32, 14), vqrshrn_n_s32(q10s32, 14));
+ *qAs16 = vcombine_s16(vrshrn_n_s32(q8s32, 14), vrshrn_n_s32(q9s32, 14));
+ *qBs16 = vcombine_s16(vrshrn_n_s32(q11s32, 14), vrshrn_n_s32(q10s32, 14));
+}
+
+static INLINE void load_s16x8q(const int16_t *in, int16x8_t *s0, int16x8_t *s1,
+ int16x8_t *s2, int16x8_t *s3, int16x8_t *s4,
+ int16x8_t *s5, int16x8_t *s6, int16x8_t *s7) {
+ *s0 = vld1q_s16(in);
+ in += 32;
+ *s1 = vld1q_s16(in);
+ in += 32;
+ *s2 = vld1q_s16(in);
+ in += 32;
+ *s3 = vld1q_s16(in);
+ in += 32;
+ *s4 = vld1q_s16(in);
+ in += 32;
+ *s5 = vld1q_s16(in);
+ in += 32;
+ *s6 = vld1q_s16(in);
+ in += 32;
+ *s7 = vld1q_s16(in);
+}
+
+static INLINE void transpose_and_store_s16_8x8(int16x8_t a0, int16x8_t a1,
+ int16x8_t a2, int16x8_t a3,
+ int16x8_t a4, int16x8_t a5,
+ int16x8_t a6, int16x8_t a7,
+ int16_t **out) {
+ transpose_s16_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
+
+ vst1q_s16(*out, a0);
+ *out += 8;
+ vst1q_s16(*out, a1);
+ *out += 8;
+ vst1q_s16(*out, a2);
+ *out += 8;
+ vst1q_s16(*out, a3);
+ *out += 8;
+ vst1q_s16(*out, a4);
+ *out += 8;
+ vst1q_s16(*out, a5);
+ *out += 8;
+ vst1q_s16(*out, a6);
+ *out += 8;
+ vst1q_s16(*out, a7);
+ *out += 8;
}
static INLINE void idct32_transpose_pair(const int16_t *input, int16_t *t_buf) {
- const int16_t *in;
int i;
- const int stride = 32;
- int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+ for (i = 0; i < 4; i++, input += 8) {
+ load_s16x8q(input, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+ transpose_and_store_s16_8x8(s0, s1, s2, s3, s4, s5, s6, s7, &t_buf);
+ }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE void load_s16x8q_tran_low(const tran_low_t *in, int16x8_t *s0,
+ int16x8_t *s1, int16x8_t *s2,
+ int16x8_t *s3, int16x8_t *s4,
+ int16x8_t *s5, int16x8_t *s6,
+ int16x8_t *s7) {
+ *s0 = load_tran_low_to_s16q(in);
+ in += 32;
+ *s1 = load_tran_low_to_s16q(in);
+ in += 32;
+ *s2 = load_tran_low_to_s16q(in);
+ in += 32;
+ *s3 = load_tran_low_to_s16q(in);
+ in += 32;
+ *s4 = load_tran_low_to_s16q(in);
+ in += 32;
+ *s5 = load_tran_low_to_s16q(in);
+ in += 32;
+ *s6 = load_tran_low_to_s16q(in);
+ in += 32;
+ *s7 = load_tran_low_to_s16q(in);
+}
+
+static INLINE void idct32_transpose_pair_tran_low(const tran_low_t *input,
+ int16_t *t_buf) {
+ int i;
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
for (i = 0; i < 4; i++, input += 8) {
- in = input;
- q8s16 = vld1q_s16(in);
- in += stride;
- q9s16 = vld1q_s16(in);
- in += stride;
- q10s16 = vld1q_s16(in);
- in += stride;
- q11s16 = vld1q_s16(in);
- in += stride;
- q12s16 = vld1q_s16(in);
- in += stride;
- q13s16 = vld1q_s16(in);
- in += stride;
- q14s16 = vld1q_s16(in);
- in += stride;
- q15s16 = vld1q_s16(in);
-
- transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16,
- &q14s16, &q15s16);
-
- vst1q_s16(t_buf, q8s16);
- t_buf += 8;
- vst1q_s16(t_buf, q9s16);
- t_buf += 8;
- vst1q_s16(t_buf, q10s16);
- t_buf += 8;
- vst1q_s16(t_buf, q11s16);
- t_buf += 8;
- vst1q_s16(t_buf, q12s16);
- t_buf += 8;
- vst1q_s16(t_buf, q13s16);
- t_buf += 8;
- vst1q_s16(t_buf, q14s16);
- t_buf += 8;
- vst1q_s16(t_buf, q15s16);
- t_buf += 8;
+ load_s16x8q_tran_low(input, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+ transpose_and_store_s16_8x8(s0, s1, s2, s3, s4, s5, s6, s7, &t_buf);
}
}
+#else // !CONFIG_VP9_HIGHBITDEPTH
+#define idct32_transpose_pair_tran_low idct32_transpose_pair
+#endif // CONFIG_VP9_HIGHBITDEPTH
static INLINE void idct32_bands_end_1st_pass(int16_t *out, int16x8_t q2s16,
int16x8_t q3s16, int16x8_t q6s16,
@@ -383,16 +430,21 @@ void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest,
int16_t trans_buf[32 * 8];
int16_t pass1[32 * 32];
int16_t pass2[32 * 32];
+ const int16_t *input_pass2 = pass1; // input of pass2 is the result of pass1
int16_t *out;
int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
for (idct32_pass_loop = 0, out = pass1; idct32_pass_loop < 2;
- idct32_pass_loop++,
- input = pass1, // the input of pass2 is the result of pass1
- out = pass2) {
- for (i = 0; i < 4; i++, input += 32 * 8, out += 8) { // idct32_bands_loop
- idct32_transpose_pair(input, trans_buf);
+ idct32_pass_loop++, out = pass2) {
+ for (i = 0; i < 4; i++, out += 8) { // idct32_bands_loop
+ if (idct32_pass_loop == 0) {
+ idct32_transpose_pair_tran_low(input, trans_buf);
+ input += 32 * 8;
+ } else {
+ idct32_transpose_pair(input_pass2, trans_buf);
+ input_pass2 += 32 * 8;
+ }
// -----------------------------------------
// BLOCK A: 16-19,28-31
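
The loop restructuring above exists because the two passes now read different types: pass 1 consumes the (possibly 32-bit) tran_low_t input, while pass 2 consumes the int16_t pass1 buffer, so a single input pointer can no longer serve both. A control-flow sketch with the transform body elided (illustrative only, not libvpx code):

#include <stdint.h>

static void idct32x32_two_pass_sketch(const int32_t *input /* tran_low_t */,
                                      int16_t *pass1, int16_t *pass2) {
  const int16_t *input_pass2 = pass1; /* pass 2 reads pass 1's output */
  int16_t *out = pass1;
  int pass, i;

  for (pass = 0; pass < 2; ++pass, out = pass2) {
    for (i = 0; i < 4; ++i, out += 8) {
      if (pass == 0) {
        /* narrow and transpose 8 columns of 32-bit coefficients */
        input += 32 * 8;
      } else {
        /* transpose 8 columns of 16-bit pass-1 results */
        input_pass2 += 32 * 8;
      }
      /* ... 32-point 1-D inverse transform writes to out ... */
    }
  }
}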
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.asm
index cbfab361af8..d83421e9e66 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.asm
@@ -15,12 +15,11 @@
AREA ||.text||, CODE, READONLY, ALIGN=2
-;void vpx_idct4x4_1_add_neon(int16_t *input, uint8_t *dest,
-; int dest_stride)
+;void vpx_idct4x4_1_add_neon(int16_t *input, uint8_t *dest, int stride)
;
; r0 int16_t input
; r1 uint8_t *dest
-; r2 int dest_stride)
+; r2 int stride)
|vpx_idct4x4_1_add_neon| PROC
ldrsh r0, [r0]
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c
index 525aac05a84..d1eae24a222 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c
@@ -9,39 +9,37 @@
*/
#include <arm_neon.h>
+#include <assert.h>
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/inv_txfm.h"
-#include "vpx_ports/mem.h"
-void vpx_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest,
- int dest_stride) {
- uint8x8_t d6u8;
- uint32x2_t d2u32 = vdup_n_u32(0);
- uint16x8_t q8u16;
- int16x8_t q0s16;
- uint8_t *d1, *d2;
- int16_t i, a1;
- int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
- out = dct_const_round_shift(out * cospi_16_64);
- a1 = ROUND_POWER_OF_TWO(out, 4);
-
- q0s16 = vdupq_n_s16(a1);
+static INLINE void idct4x4_1_add_kernel(uint8_t **dest, const int stride,
+ const int16x8_t res,
+ uint32x2_t *const d) {
+ uint16x8_t a;
+ uint8x8_t b;
+ *d = vld1_lane_u32((const uint32_t *)*dest, *d, 0);
+ *d = vld1_lane_u32((const uint32_t *)(*dest + stride), *d, 1);
+ a = vaddw_u8(vreinterpretq_u16_s16(res), vreinterpret_u8_u32(*d));
+ b = vqmovun_s16(vreinterpretq_s16_u16(a));
+ vst1_lane_u32((uint32_t *)*dest, vreinterpret_u32_u8(b), 0);
+ *dest += stride;
+ vst1_lane_u32((uint32_t *)*dest, vreinterpret_u32_u8(b), 1);
+ *dest += stride;
+}
- // dc_only_idct_add
- d1 = d2 = dest;
- for (i = 0; i < 2; i++) {
- d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 0);
- d1 += dest_stride;
- d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 1);
- d1 += dest_stride;
+void vpx_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ const int16_t out0 = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+ const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64));
+ const int16_t a1 = ROUND_POWER_OF_TWO(out1, 4);
+ const int16x8_t dc = vdupq_n_s16(a1);
+ uint32x2_t d = vdup_n_u32(0);
- q8u16 = vaddw_u8(vreinterpretq_u16_s16(q0s16), vreinterpret_u8_u32(d2u32));
- d6u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+ assert(!((intptr_t)dest % sizeof(uint32_t)));
+ assert(!(stride % sizeof(uint32_t)));
- vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 0);
- d2 += dest_stride;
- vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 1);
- d2 += dest_stride;
- }
+ idct4x4_1_add_kernel(&dest, stride, dc, &d);
+ idct4x4_1_add_kernel(&dest, stride, dc, &d);
}
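
The new asserts document a real contract: the lane loads and stores above reinterpret dest as uint32_t *, so both the pointer and the stride must be a multiple of 4 bytes. For contrast, an unaligned-safe way to gather two 4-pixel rows would go through memcpy; this is a sketch with an invented name, shown only to make the alignment assumption explicit — the libvpx code relies on the guarantee instead.

#include <arm_neon.h>
#include <stdint.h>
#include <string.h>

static uint32x2_t load_two_rows_unaligned(const uint8_t *dest, int stride) {
  uint32_t row0, row1;
  memcpy(&row0, dest, sizeof(row0));           /* 4 pixels of row 0 */
  memcpy(&row1, dest + stride, sizeof(row1));  /* 4 pixels of row 1 */
  return vset_lane_u32(row1, vdup_n_u32(row0), 1);
}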
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_add_neon.asm
index bd4e86ded25..184d218941c 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_add_neon.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_add_neon.asm
@@ -18,11 +18,11 @@
INCLUDE vpx_dsp/arm/idct_neon.asm.S
AREA Block, CODE, READONLY ; name this block of code
-;void vpx_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
+;void vpx_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int stride)
;
; r0 int16_t input
; r1 uint8_t *dest
-; r2 int dest_stride)
+; r2 int stride)
|vpx_idct4x4_16_add_neon| PROC
@@ -72,16 +72,15 @@
; do the transform on transposed rows
; stage 1
- vadd.s16 d23, d16, d18 ; (input[0] + input[2])
- vsub.s16 d24, d16, d18 ; (input[0] - input[2])
-
vmull.s16 q15, d17, d22 ; input[1] * cospi_24_64
vmull.s16 q1, d17, d20 ; input[1] * cospi_8_64
; (input[0] + input[2]) * cospi_16_64;
; (input[0] - input[2]) * cospi_16_64;
- vmull.s16 q13, d23, d21
- vmull.s16 q14, d24, d21
+ vmull.s16 q8, d16, d21
+ vmull.s16 q14, d18, d21
+ vadd.s32 q13, q8, q14
+ vsub.s32 q14, q8, q14
; input[1] * cospi_24_64 - input[3] * cospi_8_64;
; input[1] * cospi_8_64 + input[3] * cospi_24_64;
@@ -89,10 +88,10 @@
vmlal.s16 q1, d19, d22
; dct_const_round_shift
- vqrshrn.s32 d26, q13, #14
- vqrshrn.s32 d27, q14, #14
- vqrshrn.s32 d29, q15, #14
- vqrshrn.s32 d28, q1, #14
+ vrshrn.s32 d26, q13, #14
+ vrshrn.s32 d27, q14, #14
+ vrshrn.s32 d29, q15, #14
+ vrshrn.s32 d28, q1, #14
; stage 2
; output[0] = step[0] + step[3];
@@ -140,10 +139,10 @@
vmlal.s16 q1, d19, d22
; dct_const_round_shift
- vqrshrn.s32 d26, q13, #14
- vqrshrn.s32 d27, q14, #14
- vqrshrn.s32 d29, q15, #14
- vqrshrn.s32 d28, q1, #14
+ vrshrn.s32 d26, q13, #14
+ vrshrn.s32 d27, q14, #14
+ vrshrn.s32 d29, q15, #14
+ vrshrn.s32 d28, q1, #14
; stage 2
; output[0] = step[0] + step[3];
@@ -168,7 +167,7 @@
vld1.32 {d27[1]}, [r1], r2
vld1.32 {d27[0]}, [r1] ; no post-increment
- ; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i]
+ ; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * stride + i]
vaddw.u8 q8, q8, d26
vaddw.u8 q9, q9, d27
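
The vqrshrn.s32 -> vrshrn.s32 substitutions in this file (and in the other idct kernels touched by this patch) drop the saturating narrow on the >> 14 step. The assumption is that dct_const_round_shift() of these products already fits in int16_t, so a plain rounding narrow suffices and is closer to the non-saturating C reference. Intrinsic-level illustration (function names are mine):

#include <arm_neon.h>

static int16x4_t narrow14_rounding(const int32x4_t v) {
  return vrshrn_n_s32(v, 14);  /* round, shift right by 14, narrow to s16 */
}

static int16x4_t narrow14_saturating(const int32x4_t v) {
  return vqrshrn_n_s32(v, 14); /* same, but clamps results to int16_t range */
}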
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_add_neon.c
index 8f669c90765..bff98cbc169 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_add_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_add_neon.c
@@ -9,139 +9,56 @@
*/
#include <arm_neon.h>
+#include <assert.h>
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/arm/idct_neon.h"
#include "vpx_dsp/txfm_common.h"
void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest,
- int dest_stride) {
- uint8x8_t d26u8, d27u8;
- uint32x2_t d26u32, d27u32;
- uint16x8_t q8u16, q9u16;
- int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16;
- int16x4_t d22s16, d23s16, d24s16, d26s16, d27s16, d28s16, d29s16;
- int16x8_t q8s16, q9s16, q13s16, q14s16;
- int32x4_t q1s32, q13s32, q14s32, q15s32;
- int16x4x2_t d0x2s16, d1x2s16;
- int32x4x2_t q0x2s32;
- uint8_t *d;
-
- d26u32 = d27u32 = vdup_n_u32(0);
-
- q8s16 = load_tran_low_to_s16(input);
- q9s16 = load_tran_low_to_s16(input + 8);
-
- d16s16 = vget_low_s16(q8s16);
- d17s16 = vget_high_s16(q8s16);
- d18s16 = vget_low_s16(q9s16);
- d19s16 = vget_high_s16(q9s16);
-
- d0x2s16 = vtrn_s16(d16s16, d17s16);
- d1x2s16 = vtrn_s16(d18s16, d19s16);
- q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]);
- q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]);
-
- d20s16 = vdup_n_s16((int16_t)cospi_8_64);
- d21s16 = vdup_n_s16((int16_t)cospi_16_64);
-
- q0x2s32 =
- vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q9s16));
- d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
- d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
- d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
- d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
-
- d22s16 = vdup_n_s16((int16_t)cospi_24_64);
-
- // stage 1
- d23s16 = vadd_s16(d16s16, d18s16);
- d24s16 = vsub_s16(d16s16, d18s16);
-
- q15s32 = vmull_s16(d17s16, d22s16);
- q1s32 = vmull_s16(d17s16, d20s16);
- q13s32 = vmull_s16(d23s16, d21s16);
- q14s32 = vmull_s16(d24s16, d21s16);
-
- q15s32 = vmlsl_s16(q15s32, d19s16, d20s16);
- q1s32 = vmlal_s16(q1s32, d19s16, d22s16);
-
- d26s16 = vqrshrn_n_s32(q13s32, 14);
- d27s16 = vqrshrn_n_s32(q14s32, 14);
- d29s16 = vqrshrn_n_s32(q15s32, 14);
- d28s16 = vqrshrn_n_s32(q1s32, 14);
- q13s16 = vcombine_s16(d26s16, d27s16);
- q14s16 = vcombine_s16(d28s16, d29s16);
-
- // stage 2
- q8s16 = vaddq_s16(q13s16, q14s16);
- q9s16 = vsubq_s16(q13s16, q14s16);
-
- d16s16 = vget_low_s16(q8s16);
- d17s16 = vget_high_s16(q8s16);
- d18s16 = vget_high_s16(q9s16); // vswp d18 d19
- d19s16 = vget_low_s16(q9s16);
-
- d0x2s16 = vtrn_s16(d16s16, d17s16);
- d1x2s16 = vtrn_s16(d18s16, d19s16);
- q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]);
- q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]);
-
- q0x2s32 =
- vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q9s16));
- d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
- d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
- d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
- d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
-
- // do the transform on columns
- // stage 1
- d23s16 = vadd_s16(d16s16, d18s16);
- d24s16 = vsub_s16(d16s16, d18s16);
-
- q15s32 = vmull_s16(d17s16, d22s16);
- q1s32 = vmull_s16(d17s16, d20s16);
- q13s32 = vmull_s16(d23s16, d21s16);
- q14s32 = vmull_s16(d24s16, d21s16);
-
- q15s32 = vmlsl_s16(q15s32, d19s16, d20s16);
- q1s32 = vmlal_s16(q1s32, d19s16, d22s16);
-
- d26s16 = vqrshrn_n_s32(q13s32, 14);
- d27s16 = vqrshrn_n_s32(q14s32, 14);
- d29s16 = vqrshrn_n_s32(q15s32, 14);
- d28s16 = vqrshrn_n_s32(q1s32, 14);
- q13s16 = vcombine_s16(d26s16, d27s16);
- q14s16 = vcombine_s16(d28s16, d29s16);
-
- // stage 2
- q8s16 = vaddq_s16(q13s16, q14s16);
- q9s16 = vsubq_s16(q13s16, q14s16);
-
- q8s16 = vrshrq_n_s16(q8s16, 4);
- q9s16 = vrshrq_n_s16(q9s16, 4);
-
- d = dest;
- d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 0);
- d += dest_stride;
- d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 1);
- d += dest_stride;
- d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 1);
- d += dest_stride;
- d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 0);
-
- q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u32(d26u32));
- q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u32(d27u32));
-
- d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
- d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
-
- d = dest;
- vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 0);
- d += dest_stride;
- vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 1);
- d += dest_stride;
- vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 1);
- d += dest_stride;
- vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 0);
+ int stride) {
+ const uint8_t *dst = dest;
+ const int16x4_t cospis = vld1_s16(kCospi);
+ uint32x2_t dest01_u32 = vdup_n_u32(0);
+ uint32x2_t dest32_u32 = vdup_n_u32(0);
+ int16x8_t a0, a1;
+ uint8x8_t d01, d32;
+ uint16x8_t d01_u16, d32_u16;
+
+ assert(!((intptr_t)dest % sizeof(uint32_t)));
+ assert(!(stride % sizeof(uint32_t)));
+
+ // Rows
+ a0 = load_tran_low_to_s16q(input);
+ a1 = load_tran_low_to_s16q(input + 8);
+ idct4x4_16_kernel_bd8(cospis, &a0, &a1);
+
+ // Columns
+ a1 = vcombine_s16(vget_high_s16(a1), vget_low_s16(a1));
+ idct4x4_16_kernel_bd8(cospis, &a0, &a1);
+ a0 = vrshrq_n_s16(a0, 4);
+ a1 = vrshrq_n_s16(a1, 4);
+
+ dest01_u32 = vld1_lane_u32((const uint32_t *)dst, dest01_u32, 0);
+ dst += stride;
+ dest01_u32 = vld1_lane_u32((const uint32_t *)dst, dest01_u32, 1);
+ dst += stride;
+ dest32_u32 = vld1_lane_u32((const uint32_t *)dst, dest32_u32, 1);
+ dst += stride;
+ dest32_u32 = vld1_lane_u32((const uint32_t *)dst, dest32_u32, 0);
+
+ d01_u16 =
+ vaddw_u8(vreinterpretq_u16_s16(a0), vreinterpret_u8_u32(dest01_u32));
+ d32_u16 =
+ vaddw_u8(vreinterpretq_u16_s16(a1), vreinterpret_u8_u32(dest32_u32));
+ d01 = vqmovun_s16(vreinterpretq_s16_u16(d01_u16));
+ d32 = vqmovun_s16(vreinterpretq_s16_u16(d32_u16));
+
+ vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d01), 0);
+ dest += stride;
+ vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d01), 1);
+ dest += stride;
+ vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d32), 1);
+ dest += stride;
+ vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d32), 0);
}
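
For orientation, idct4x4_16_kernel_bd8() (defined in idct_neon.h later in this patch) performs one 4-point pass per call, rows first and then columns; its second output register holds rows 3 and 2, which is why the high/low halves are swapped between the two passes. Its scalar counterpart, following the usual VP9 idct4 definition, is sketched below with illustrative names and the standard cospi constants.

#include <stdint.h>

#define COSPI_8_64 15137
#define COSPI_16_64 11585
#define COSPI_24_64 6270

static int16_t round14(int32_t v) { return (int16_t)((v + (1 << 13)) >> 14); }

static void idct4_1d_ref(const int16_t in[4], int16_t out[4]) {
  /* stage 1 */
  const int16_t s0 = round14((in[0] + in[2]) * COSPI_16_64);
  const int16_t s1 = round14((in[0] - in[2]) * COSPI_16_64);
  const int16_t s2 = round14(in[1] * COSPI_24_64 - in[3] * COSPI_8_64);
  const int16_t s3 = round14(in[1] * COSPI_8_64 + in[3] * COSPI_24_64);
  /* stage 2 */
  out[0] = (int16_t)(s0 + s3);
  out[1] = (int16_t)(s1 + s2);
  out[2] = (int16_t)(s1 - s2);
  out[3] = (int16_t)(s0 - s3);
}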
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.asm
index e4531c6e97f..29f678a0382 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.asm
@@ -15,12 +15,11 @@
AREA ||.text||, CODE, READONLY, ALIGN=2
-;void vpx_idct8x8_1_add_neon(int16_t *input, uint8_t *dest,
-; int dest_stride)
+;void vpx_idct8x8_1_add_neon(int16_t *input, uint8_t *dest, int stride)
;
; r0 int16_t input
; r1 uint8_t *dest
-; r2 int dest_stride)
+; r2 int stride)
|vpx_idct8x8_1_add_neon| PROC
ldrsh r0, [r0]
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c
index eee41e6c6b1..7bcce913bdb 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c
@@ -12,51 +12,53 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/inv_txfm.h"
-#include "vpx_ports/mem.h"
-void vpx_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest,
- int dest_stride) {
- uint8x8_t d2u8, d3u8, d30u8, d31u8;
- uint64x1_t d2u64, d3u64, d4u64, d5u64;
- uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16;
- int16x8_t q0s16;
- uint8_t *d1, *d2;
- int16_t i, a1;
- int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
- out = dct_const_round_shift(out * cospi_16_64);
- a1 = ROUND_POWER_OF_TWO(out, 5);
-
- q0s16 = vdupq_n_s16(a1);
- q0u16 = vreinterpretq_u16_s16(q0s16);
+static INLINE uint8x8_t create_dcd(const int16_t dc) {
+ int16x8_t t = vdupq_n_s16(dc);
+ return vqmovun_s16(t);
+}
- d1 = d2 = dest;
- for (i = 0; i < 2; i++) {
- d2u64 = vld1_u64((const uint64_t *)d1);
- d1 += dest_stride;
- d3u64 = vld1_u64((const uint64_t *)d1);
- d1 += dest_stride;
- d4u64 = vld1_u64((const uint64_t *)d1);
- d1 += dest_stride;
- d5u64 = vld1_u64((const uint64_t *)d1);
- d1 += dest_stride;
+static INLINE void idct8x8_1_add_pos_kernel(uint8_t **dest, const int stride,
+ const uint8x8_t res) {
+ const uint8x8_t a = vld1_u8(*dest);
+ const uint8x8_t b = vqadd_u8(a, res);
+ vst1_u8(*dest, b);
+ *dest += stride;
+}
- q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64));
- q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64));
- q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64));
- q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64));
+static INLINE void idct8x8_1_add_neg_kernel(uint8_t **dest, const int stride,
+ const uint8x8_t res) {
+ const uint8x8_t a = vld1_u8(*dest);
+ const uint8x8_t b = vqsub_u8(a, res);
+ vst1_u8(*dest, b);
+ *dest += stride;
+}
- d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
- d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
- d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
- d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
+void vpx_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ const int16_t out0 = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+ const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64));
+ const int16_t a1 = ROUND_POWER_OF_TWO(out1, 5);
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
- d2 += dest_stride;
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
- d2 += dest_stride;
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8));
- d2 += dest_stride;
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d31u8));
- d2 += dest_stride;
+ if (a1 >= 0) {
+ const uint8x8_t dc = create_dcd(a1);
+ idct8x8_1_add_pos_kernel(&dest, stride, dc);
+ idct8x8_1_add_pos_kernel(&dest, stride, dc);
+ idct8x8_1_add_pos_kernel(&dest, stride, dc);
+ idct8x8_1_add_pos_kernel(&dest, stride, dc);
+ idct8x8_1_add_pos_kernel(&dest, stride, dc);
+ idct8x8_1_add_pos_kernel(&dest, stride, dc);
+ idct8x8_1_add_pos_kernel(&dest, stride, dc);
+ idct8x8_1_add_pos_kernel(&dest, stride, dc);
+ } else {
+ const uint8x8_t dc = create_dcd(-a1);
+ idct8x8_1_add_neg_kernel(&dest, stride, dc);
+ idct8x8_1_add_neg_kernel(&dest, stride, dc);
+ idct8x8_1_add_neg_kernel(&dest, stride, dc);
+ idct8x8_1_add_neg_kernel(&dest, stride, dc);
+ idct8x8_1_add_neg_kernel(&dest, stride, dc);
+ idct8x8_1_add_neg_kernel(&dest, stride, dc);
+ idct8x8_1_add_neg_kernel(&dest, stride, dc);
+ idct8x8_1_add_neg_kernel(&dest, stride, dc);
}
}
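
The positive/negative split above keeps the whole DC add in 8-bit saturating arithmetic: with a uint8_t destination, a non-negative DC is applied with vqadd_u8 and a negative DC with vqsub_u8 of its magnitude, so no widening to 16 bits is needed. Scalar view of the same idea (a sketch, not libvpx code):

#include <stdint.h>

static void dc_add_row_ref(uint8_t *row, int n, int16_t a1) {
  int i;
  if (a1 >= 0) {
    const uint8_t dc = (uint8_t)(a1 > 255 ? 255 : a1);
    for (i = 0; i < n; ++i) {
      const int v = row[i] + dc;
      row[i] = (uint8_t)(v > 255 ? 255 : v);   /* saturating add */
    }
  } else {
    const uint8_t dc = (uint8_t)(-a1 > 255 ? 255 : -a1);
    for (i = 0; i < n; ++i) {
      const int v = row[i] - dc;
      row[i] = (uint8_t)(v < 0 ? 0 : v);       /* saturating subtract */
    }
  }
}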
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_add_neon.asm
index a5c9c927d67..2bfbcc5a52c 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_add_neon.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_add_neon.asm
@@ -47,12 +47,12 @@
vmlsl.s16 q6, d23, d3
; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d8, q2, #14 ; >> 14
- vqrshrn.s32 d9, q3, #14 ; >> 14
+ vrshrn.s32 d8, q2, #14 ; >> 14
+ vrshrn.s32 d9, q3, #14 ; >> 14
; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d10, q5, #14 ; >> 14
- vqrshrn.s32 d11, q6, #14 ; >> 14
+ vrshrn.s32 d10, q5, #14 ; >> 14
+ vrshrn.s32 d11, q6, #14 ; >> 14
; input[1] * cospi_4_64
vmull.s16 q2, d18, d1
@@ -71,15 +71,15 @@
vmlal.s16 q13, d23, d2
; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d14, q2, #14 ; >> 14
- vqrshrn.s32 d15, q3, #14 ; >> 14
+ vrshrn.s32 d14, q2, #14 ; >> 14
+ vrshrn.s32 d15, q3, #14 ; >> 14
; stage 2 & stage 3 - even half
vdup.16 d0, r7 ; duplicate cospi_16_64
; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d12, q9, #14 ; >> 14
- vqrshrn.s32 d13, q13, #14 ; >> 14
+ vrshrn.s32 d12, q9, #14 ; >> 14
+ vrshrn.s32 d13, q13, #14 ; >> 14
; input[0] * cospi_16_64
vmull.s16 q2, d16, d0
@@ -101,12 +101,12 @@
vdup.16 d1, r9 ; duplicate cospi_8_64
; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d18, q2, #14 ; >> 14
- vqrshrn.s32 d19, q3, #14 ; >> 14
+ vrshrn.s32 d18, q2, #14 ; >> 14
+ vrshrn.s32 d19, q3, #14 ; >> 14
; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d22, q13, #14 ; >> 14
- vqrshrn.s32 d23, q15, #14 ; >> 14
+ vrshrn.s32 d22, q13, #14 ; >> 14
+ vrshrn.s32 d23, q15, #14 ; >> 14
; input[1] * cospi_24_64 - input[3] * cospi_8_64
; input[1] * cospi_24_64
@@ -126,12 +126,12 @@
vmlal.s16 q12, d29, d0
; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d26, q2, #14 ; >> 14
- vqrshrn.s32 d27, q3, #14 ; >> 14
+ vrshrn.s32 d26, q2, #14 ; >> 14
+ vrshrn.s32 d27, q3, #14 ; >> 14
; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d30, q8, #14 ; >> 14
- vqrshrn.s32 d31, q12, #14 ; >> 14
+ vrshrn.s32 d30, q8, #14 ; >> 14
+ vrshrn.s32 d31, q12, #14 ; >> 14
vadd.s16 q0, q9, q15 ; output[0] = step[0] + step[3]
vadd.s16 q1, q11, q13 ; output[1] = step[1] + step[2]
@@ -164,12 +164,12 @@
vmlal.s16 q12, d27, d16
; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d10, q9, #14 ; >> 14
- vqrshrn.s32 d11, q10, #14 ; >> 14
+ vrshrn.s32 d10, q9, #14 ; >> 14
+ vrshrn.s32 d11, q10, #14 ; >> 14
; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d12, q11, #14 ; >> 14
- vqrshrn.s32 d13, q12, #14 ; >> 14
+ vrshrn.s32 d12, q11, #14 ; >> 14
+ vrshrn.s32 d13, q12, #14 ; >> 14
; stage 4
vadd.s16 q8, q0, q7 ; output[0] = step1[0] + step1[7];
@@ -200,11 +200,11 @@
MEND
AREA Block, CODE, READONLY ; name this block of code
-;void vpx_idct8x8_64_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
+;void vpx_idct8x8_64_add_neon(int16_t *input, uint8_t *dest, int stride)
;
; r0 int16_t input
; r1 uint8_t *dest
-; r2 int dest_stride)
+; r2 int stride)
|vpx_idct8x8_64_add_neon| PROC
push {r4-r9}
@@ -270,7 +270,7 @@
vld1.64 {d6}, [r1], r2
vld1.64 {d7}, [r1]
- ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]
+ ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * stride + i]
vaddw.u8 q8, q8, d0
vaddw.u8 q9, q9, d1
vaddw.u8 q10, q10, d2
@@ -305,11 +305,11 @@
bx lr
ENDP ; |vpx_idct8x8_64_add_neon|
-;void vpx_idct8x8_12_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
+;void vpx_idct8x8_12_add_neon(int16_t *input, uint8_t *dest, int stride)
;
; r0 int16_t input
; r1 uint8_t *dest
-; r2 int dest_stride)
+; r2 int stride)
|vpx_idct8x8_12_add_neon| PROC
push {r4-r9}
@@ -423,12 +423,12 @@
vmlal.s16 q12, d27, d16
; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d10, q9, #14 ; >> 14
- vqrshrn.s32 d11, q10, #14 ; >> 14
+ vrshrn.s32 d10, q9, #14 ; >> 14
+ vrshrn.s32 d11, q10, #14 ; >> 14
; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d12, q11, #14 ; >> 14
- vqrshrn.s32 d13, q12, #14 ; >> 14
+ vrshrn.s32 d12, q11, #14 ; >> 14
+ vrshrn.s32 d13, q12, #14 ; >> 14
; stage 4
vadd.s16 q8, q0, q7 ; output[0] = step1[0] + step1[7];
@@ -469,7 +469,7 @@
vld1.64 {d6}, [r1], r2
vld1.64 {d7}, [r1]
- ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]
+ ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * stride + i]
vaddw.u8 q8, q8, d0
vaddw.u8 q9, q9, d1
vaddw.u8 q10, q10, d2
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_add_neon.c
index 159a6ec9891..279da67d74f 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_add_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_add_neon.c
@@ -16,431 +16,111 @@
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_dsp/txfm_common.h"
-static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16,
- int16x8_t *q10s16, int16x8_t *q11s16,
- int16x8_t *q12s16, int16x8_t *q13s16,
- int16x8_t *q14s16, int16x8_t *q15s16) {
- int16x4_t d0s16, d1s16, d2s16, d3s16;
- int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
- int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
- int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
- int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
- int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32;
- int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;
-
- d0s16 = vdup_n_s16((int16_t)cospi_28_64);
- d1s16 = vdup_n_s16((int16_t)cospi_4_64);
- d2s16 = vdup_n_s16((int16_t)cospi_12_64);
- d3s16 = vdup_n_s16((int16_t)cospi_20_64);
-
- d16s16 = vget_low_s16(*q8s16);
- d17s16 = vget_high_s16(*q8s16);
- d18s16 = vget_low_s16(*q9s16);
- d19s16 = vget_high_s16(*q9s16);
- d20s16 = vget_low_s16(*q10s16);
- d21s16 = vget_high_s16(*q10s16);
- d22s16 = vget_low_s16(*q11s16);
- d23s16 = vget_high_s16(*q11s16);
- d24s16 = vget_low_s16(*q12s16);
- d25s16 = vget_high_s16(*q12s16);
- d26s16 = vget_low_s16(*q13s16);
- d27s16 = vget_high_s16(*q13s16);
- d28s16 = vget_low_s16(*q14s16);
- d29s16 = vget_high_s16(*q14s16);
- d30s16 = vget_low_s16(*q15s16);
- d31s16 = vget_high_s16(*q15s16);
-
- q2s32 = vmull_s16(d18s16, d0s16);
- q3s32 = vmull_s16(d19s16, d0s16);
- q5s32 = vmull_s16(d26s16, d2s16);
- q6s32 = vmull_s16(d27s16, d2s16);
-
- q2s32 = vmlsl_s16(q2s32, d30s16, d1s16);
- q3s32 = vmlsl_s16(q3s32, d31s16, d1s16);
- q5s32 = vmlsl_s16(q5s32, d22s16, d3s16);
- q6s32 = vmlsl_s16(q6s32, d23s16, d3s16);
-
- d8s16 = vqrshrn_n_s32(q2s32, 14);
- d9s16 = vqrshrn_n_s32(q3s32, 14);
- d10s16 = vqrshrn_n_s32(q5s32, 14);
- d11s16 = vqrshrn_n_s32(q6s32, 14);
- q4s16 = vcombine_s16(d8s16, d9s16);
- q5s16 = vcombine_s16(d10s16, d11s16);
-
- q2s32 = vmull_s16(d18s16, d1s16);
- q3s32 = vmull_s16(d19s16, d1s16);
- q9s32 = vmull_s16(d26s16, d3s16);
- q13s32 = vmull_s16(d27s16, d3s16);
-
- q2s32 = vmlal_s16(q2s32, d30s16, d0s16);
- q3s32 = vmlal_s16(q3s32, d31s16, d0s16);
- q9s32 = vmlal_s16(q9s32, d22s16, d2s16);
- q13s32 = vmlal_s16(q13s32, d23s16, d2s16);
-
- d14s16 = vqrshrn_n_s32(q2s32, 14);
- d15s16 = vqrshrn_n_s32(q3s32, 14);
- d12s16 = vqrshrn_n_s32(q9s32, 14);
- d13s16 = vqrshrn_n_s32(q13s32, 14);
- q6s16 = vcombine_s16(d12s16, d13s16);
- q7s16 = vcombine_s16(d14s16, d15s16);
-
- d0s16 = vdup_n_s16((int16_t)cospi_16_64);
-
- q2s32 = vmull_s16(d16s16, d0s16);
- q3s32 = vmull_s16(d17s16, d0s16);
- q13s32 = vmull_s16(d16s16, d0s16);
- q15s32 = vmull_s16(d17s16, d0s16);
-
- q2s32 = vmlal_s16(q2s32, d24s16, d0s16);
- q3s32 = vmlal_s16(q3s32, d25s16, d0s16);
- q13s32 = vmlsl_s16(q13s32, d24s16, d0s16);
- q15s32 = vmlsl_s16(q15s32, d25s16, d0s16);
-
- d0s16 = vdup_n_s16((int16_t)cospi_24_64);
- d1s16 = vdup_n_s16((int16_t)cospi_8_64);
-
- d18s16 = vqrshrn_n_s32(q2s32, 14);
- d19s16 = vqrshrn_n_s32(q3s32, 14);
- d22s16 = vqrshrn_n_s32(q13s32, 14);
- d23s16 = vqrshrn_n_s32(q15s32, 14);
- *q9s16 = vcombine_s16(d18s16, d19s16);
- *q11s16 = vcombine_s16(d22s16, d23s16);
-
- q2s32 = vmull_s16(d20s16, d0s16);
- q3s32 = vmull_s16(d21s16, d0s16);
- q8s32 = vmull_s16(d20s16, d1s16);
- q12s32 = vmull_s16(d21s16, d1s16);
-
- q2s32 = vmlsl_s16(q2s32, d28s16, d1s16);
- q3s32 = vmlsl_s16(q3s32, d29s16, d1s16);
- q8s32 = vmlal_s16(q8s32, d28s16, d0s16);
- q12s32 = vmlal_s16(q12s32, d29s16, d0s16);
-
- d26s16 = vqrshrn_n_s32(q2s32, 14);
- d27s16 = vqrshrn_n_s32(q3s32, 14);
- d30s16 = vqrshrn_n_s32(q8s32, 14);
- d31s16 = vqrshrn_n_s32(q12s32, 14);
- *q13s16 = vcombine_s16(d26s16, d27s16);
- *q15s16 = vcombine_s16(d30s16, d31s16);
-
- q0s16 = vaddq_s16(*q9s16, *q15s16);
- q1s16 = vaddq_s16(*q11s16, *q13s16);
- q2s16 = vsubq_s16(*q11s16, *q13s16);
- q3s16 = vsubq_s16(*q9s16, *q15s16);
-
- *q13s16 = vsubq_s16(q4s16, q5s16);
- q4s16 = vaddq_s16(q4s16, q5s16);
- *q14s16 = vsubq_s16(q7s16, q6s16);
- q7s16 = vaddq_s16(q7s16, q6s16);
- d26s16 = vget_low_s16(*q13s16);
- d27s16 = vget_high_s16(*q13s16);
- d28s16 = vget_low_s16(*q14s16);
- d29s16 = vget_high_s16(*q14s16);
-
- d16s16 = vdup_n_s16((int16_t)cospi_16_64);
-
- q9s32 = vmull_s16(d28s16, d16s16);
- q10s32 = vmull_s16(d29s16, d16s16);
- q11s32 = vmull_s16(d28s16, d16s16);
- q12s32 = vmull_s16(d29s16, d16s16);
-
- q9s32 = vmlsl_s16(q9s32, d26s16, d16s16);
- q10s32 = vmlsl_s16(q10s32, d27s16, d16s16);
- q11s32 = vmlal_s16(q11s32, d26s16, d16s16);
- q12s32 = vmlal_s16(q12s32, d27s16, d16s16);
-
- d10s16 = vqrshrn_n_s32(q9s32, 14);
- d11s16 = vqrshrn_n_s32(q10s32, 14);
- d12s16 = vqrshrn_n_s32(q11s32, 14);
- d13s16 = vqrshrn_n_s32(q12s32, 14);
- q5s16 = vcombine_s16(d10s16, d11s16);
- q6s16 = vcombine_s16(d12s16, d13s16);
-
- *q8s16 = vaddq_s16(q0s16, q7s16);
- *q9s16 = vaddq_s16(q1s16, q6s16);
- *q10s16 = vaddq_s16(q2s16, q5s16);
- *q11s16 = vaddq_s16(q3s16, q4s16);
- *q12s16 = vsubq_s16(q3s16, q4s16);
- *q13s16 = vsubq_s16(q2s16, q5s16);
- *q14s16 = vsubq_s16(q1s16, q6s16);
- *q15s16 = vsubq_s16(q0s16, q7s16);
+static INLINE void add8x8(int16x8_t a0, int16x8_t a1, int16x8_t a2,
+ int16x8_t a3, int16x8_t a4, int16x8_t a5,
+ int16x8_t a6, int16x8_t a7, uint8_t *dest,
+ const int stride) {
+ const uint8_t *dst = dest;
+ uint8x8_t d0, d1, d2, d3, d4, d5, d6, d7;
+ uint16x8_t d0_u16, d1_u16, d2_u16, d3_u16, d4_u16, d5_u16, d6_u16, d7_u16;
+
+ a0 = vrshrq_n_s16(a0, 5);
+ a1 = vrshrq_n_s16(a1, 5);
+ a2 = vrshrq_n_s16(a2, 5);
+ a3 = vrshrq_n_s16(a3, 5);
+ a4 = vrshrq_n_s16(a4, 5);
+ a5 = vrshrq_n_s16(a5, 5);
+ a6 = vrshrq_n_s16(a6, 5);
+ a7 = vrshrq_n_s16(a7, 5);
+
+ d0 = vld1_u8(dst);
+ dst += stride;
+ d1 = vld1_u8(dst);
+ dst += stride;
+ d2 = vld1_u8(dst);
+ dst += stride;
+ d3 = vld1_u8(dst);
+ dst += stride;
+ d4 = vld1_u8(dst);
+ dst += stride;
+ d5 = vld1_u8(dst);
+ dst += stride;
+ d6 = vld1_u8(dst);
+ dst += stride;
+ d7 = vld1_u8(dst);
+
+ d0_u16 = vaddw_u8(vreinterpretq_u16_s16(a0), d0);
+ d1_u16 = vaddw_u8(vreinterpretq_u16_s16(a1), d1);
+ d2_u16 = vaddw_u8(vreinterpretq_u16_s16(a2), d2);
+ d3_u16 = vaddw_u8(vreinterpretq_u16_s16(a3), d3);
+ d4_u16 = vaddw_u8(vreinterpretq_u16_s16(a4), d4);
+ d5_u16 = vaddw_u8(vreinterpretq_u16_s16(a5), d5);
+ d6_u16 = vaddw_u8(vreinterpretq_u16_s16(a6), d6);
+ d7_u16 = vaddw_u8(vreinterpretq_u16_s16(a7), d7);
+
+ d0 = vqmovun_s16(vreinterpretq_s16_u16(d0_u16));
+ d1 = vqmovun_s16(vreinterpretq_s16_u16(d1_u16));
+ d2 = vqmovun_s16(vreinterpretq_s16_u16(d2_u16));
+ d3 = vqmovun_s16(vreinterpretq_s16_u16(d3_u16));
+ d4 = vqmovun_s16(vreinterpretq_s16_u16(d4_u16));
+ d5 = vqmovun_s16(vreinterpretq_s16_u16(d5_u16));
+ d6 = vqmovun_s16(vreinterpretq_s16_u16(d6_u16));
+ d7 = vqmovun_s16(vreinterpretq_s16_u16(d7_u16));
+
+ vst1_u8(dest, d0);
+ dest += stride;
+ vst1_u8(dest, d1);
+ dest += stride;
+ vst1_u8(dest, d2);
+ dest += stride;
+ vst1_u8(dest, d3);
+ dest += stride;
+ vst1_u8(dest, d4);
+ dest += stride;
+ vst1_u8(dest, d5);
+ dest += stride;
+ vst1_u8(dest, d6);
+ dest += stride;
+ vst1_u8(dest, d7);
}
void vpx_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest,
- int dest_stride) {
- uint8_t *d1, *d2;
- uint8x8_t d0u8, d1u8, d2u8, d3u8;
- uint64x1_t d0u64, d1u64, d2u64, d3u64;
- int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
- uint16x8_t q8u16, q9u16, q10u16, q11u16;
-
- q8s16 = load_tran_low_to_s16(input);
- q9s16 = load_tran_low_to_s16(input + 8);
- q10s16 = load_tran_low_to_s16(input + 16);
- q11s16 = load_tran_low_to_s16(input + 24);
- q12s16 = load_tran_low_to_s16(input + 32);
- q13s16 = load_tran_low_to_s16(input + 40);
- q14s16 = load_tran_low_to_s16(input + 48);
- q15s16 = load_tran_low_to_s16(input + 56);
-
- transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
- &q15s16);
-
- IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
- &q15s16);
-
- transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
- &q15s16);
-
- IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
- &q15s16);
-
- q8s16 = vrshrq_n_s16(q8s16, 5);
- q9s16 = vrshrq_n_s16(q9s16, 5);
- q10s16 = vrshrq_n_s16(q10s16, 5);
- q11s16 = vrshrq_n_s16(q11s16, 5);
- q12s16 = vrshrq_n_s16(q12s16, 5);
- q13s16 = vrshrq_n_s16(q13s16, 5);
- q14s16 = vrshrq_n_s16(q14s16, 5);
- q15s16 = vrshrq_n_s16(q15s16, 5);
-
- d1 = d2 = dest;
-
- d0u64 = vld1_u64((uint64_t *)d1);
- d1 += dest_stride;
- d1u64 = vld1_u64((uint64_t *)d1);
- d1 += dest_stride;
- d2u64 = vld1_u64((uint64_t *)d1);
- d1 += dest_stride;
- d3u64 = vld1_u64((uint64_t *)d1);
- d1 += dest_stride;
-
- q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64));
- q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64));
- q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64));
- q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64));
-
- d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
- d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
- d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
- d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
-
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
- d2 += dest_stride;
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
- d2 += dest_stride;
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
- d2 += dest_stride;
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
- d2 += dest_stride;
-
- q8s16 = q12s16;
- q9s16 = q13s16;
- q10s16 = q14s16;
- q11s16 = q15s16;
-
- d0u64 = vld1_u64((uint64_t *)d1);
- d1 += dest_stride;
- d1u64 = vld1_u64((uint64_t *)d1);
- d1 += dest_stride;
- d2u64 = vld1_u64((uint64_t *)d1);
- d1 += dest_stride;
- d3u64 = vld1_u64((uint64_t *)d1);
- d1 += dest_stride;
-
- q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64));
- q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64));
- q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64));
- q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64));
-
- d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
- d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
- d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
- d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
-
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
- d2 += dest_stride;
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
- d2 += dest_stride;
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
- d2 += dest_stride;
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
- d2 += dest_stride;
+ int stride) {
+ const int16x8_t cospis = vld1q_s16(kCospi);
+ const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24
+ const int16x4_t cospis1 = vget_high_s16(cospis); // cospi 4, 12, 20, 28
+ int16x8_t a0 = load_tran_low_to_s16q(input);
+ int16x8_t a1 = load_tran_low_to_s16q(input + 8);
+ int16x8_t a2 = load_tran_low_to_s16q(input + 16);
+ int16x8_t a3 = load_tran_low_to_s16q(input + 24);
+ int16x8_t a4 = load_tran_low_to_s16q(input + 32);
+ int16x8_t a5 = load_tran_low_to_s16q(input + 40);
+ int16x8_t a6 = load_tran_low_to_s16q(input + 48);
+ int16x8_t a7 = load_tran_low_to_s16q(input + 56);
+
+ idct8x8_64_1d_bd8(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
+ idct8x8_64_1d_bd8(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
+ add8x8(a0, a1, a2, a3, a4, a5, a6, a7, dest, stride);
}
void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest,
- int dest_stride) {
- uint8_t *d1, *d2;
- uint8x8_t d0u8, d1u8, d2u8, d3u8;
- int16x4_t d10s16, d11s16, d12s16, d13s16, d16s16;
- int16x4_t d26s16, d27s16, d28s16, d29s16;
- uint64x1_t d0u64, d1u64, d2u64, d3u64;
- int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
- int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
- uint16x8_t q8u16, q9u16, q10u16, q11u16;
- int32x4_t q9s32, q10s32, q11s32, q12s32;
-
- q8s16 = load_tran_low_to_s16(input);
- q9s16 = load_tran_low_to_s16(input + 8);
- q10s16 = load_tran_low_to_s16(input + 16);
- q11s16 = load_tran_low_to_s16(input + 24);
- q12s16 = load_tran_low_to_s16(input + 32);
- q13s16 = load_tran_low_to_s16(input + 40);
- q14s16 = load_tran_low_to_s16(input + 48);
- q15s16 = load_tran_low_to_s16(input + 56);
-
- transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
- &q15s16);
-
- // First transform rows
- // stage 1
- q0s16 = vdupq_n_s16((int16_t)cospi_28_64 * 2);
- q1s16 = vdupq_n_s16((int16_t)cospi_4_64 * 2);
-
- q4s16 = vqrdmulhq_s16(q9s16, q0s16);
-
- q0s16 = vdupq_n_s16(-(int16_t)cospi_20_64 * 2);
-
- q7s16 = vqrdmulhq_s16(q9s16, q1s16);
-
- q1s16 = vdupq_n_s16((int16_t)cospi_12_64 * 2);
-
- q5s16 = vqrdmulhq_s16(q11s16, q0s16);
-
- q0s16 = vdupq_n_s16((int16_t)cospi_16_64 * 2);
-
- q6s16 = vqrdmulhq_s16(q11s16, q1s16);
-
- // stage 2 & stage 3 - even half
- q1s16 = vdupq_n_s16((int16_t)cospi_24_64 * 2);
-
- q9s16 = vqrdmulhq_s16(q8s16, q0s16);
-
- q0s16 = vdupq_n_s16((int16_t)cospi_8_64 * 2);
-
- q13s16 = vqrdmulhq_s16(q10s16, q1s16);
-
- q15s16 = vqrdmulhq_s16(q10s16, q0s16);
-
- // stage 3 -odd half
- q0s16 = vaddq_s16(q9s16, q15s16);
- q1s16 = vaddq_s16(q9s16, q13s16);
- q2s16 = vsubq_s16(q9s16, q13s16);
- q3s16 = vsubq_s16(q9s16, q15s16);
-
- // stage 2 - odd half
- q13s16 = vsubq_s16(q4s16, q5s16);
- q4s16 = vaddq_s16(q4s16, q5s16);
- q14s16 = vsubq_s16(q7s16, q6s16);
- q7s16 = vaddq_s16(q7s16, q6s16);
- d26s16 = vget_low_s16(q13s16);
- d27s16 = vget_high_s16(q13s16);
- d28s16 = vget_low_s16(q14s16);
- d29s16 = vget_high_s16(q14s16);
-
- d16s16 = vdup_n_s16((int16_t)cospi_16_64);
- q9s32 = vmull_s16(d28s16, d16s16);
- q10s32 = vmull_s16(d29s16, d16s16);
- q11s32 = vmull_s16(d28s16, d16s16);
- q12s32 = vmull_s16(d29s16, d16s16);
-
- q9s32 = vmlsl_s16(q9s32, d26s16, d16s16);
- q10s32 = vmlsl_s16(q10s32, d27s16, d16s16);
- q11s32 = vmlal_s16(q11s32, d26s16, d16s16);
- q12s32 = vmlal_s16(q12s32, d27s16, d16s16);
-
- d10s16 = vqrshrn_n_s32(q9s32, 14);
- d11s16 = vqrshrn_n_s32(q10s32, 14);
- d12s16 = vqrshrn_n_s32(q11s32, 14);
- d13s16 = vqrshrn_n_s32(q12s32, 14);
- q5s16 = vcombine_s16(d10s16, d11s16);
- q6s16 = vcombine_s16(d12s16, d13s16);
-
- // stage 4
- q8s16 = vaddq_s16(q0s16, q7s16);
- q9s16 = vaddq_s16(q1s16, q6s16);
- q10s16 = vaddq_s16(q2s16, q5s16);
- q11s16 = vaddq_s16(q3s16, q4s16);
- q12s16 = vsubq_s16(q3s16, q4s16);
- q13s16 = vsubq_s16(q2s16, q5s16);
- q14s16 = vsubq_s16(q1s16, q6s16);
- q15s16 = vsubq_s16(q0s16, q7s16);
-
- transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
- &q15s16);
-
- IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
- &q15s16);
-
- q8s16 = vrshrq_n_s16(q8s16, 5);
- q9s16 = vrshrq_n_s16(q9s16, 5);
- q10s16 = vrshrq_n_s16(q10s16, 5);
- q11s16 = vrshrq_n_s16(q11s16, 5);
- q12s16 = vrshrq_n_s16(q12s16, 5);
- q13s16 = vrshrq_n_s16(q13s16, 5);
- q14s16 = vrshrq_n_s16(q14s16, 5);
- q15s16 = vrshrq_n_s16(q15s16, 5);
-
- d1 = d2 = dest;
-
- d0u64 = vld1_u64((uint64_t *)d1);
- d1 += dest_stride;
- d1u64 = vld1_u64((uint64_t *)d1);
- d1 += dest_stride;
- d2u64 = vld1_u64((uint64_t *)d1);
- d1 += dest_stride;
- d3u64 = vld1_u64((uint64_t *)d1);
- d1 += dest_stride;
-
- q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64));
- q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64));
- q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64));
- q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64));
-
- d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
- d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
- d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
- d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
-
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
- d2 += dest_stride;
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
- d2 += dest_stride;
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
- d2 += dest_stride;
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
- d2 += dest_stride;
-
- q8s16 = q12s16;
- q9s16 = q13s16;
- q10s16 = q14s16;
- q11s16 = q15s16;
-
- d0u64 = vld1_u64((uint64_t *)d1);
- d1 += dest_stride;
- d1u64 = vld1_u64((uint64_t *)d1);
- d1 += dest_stride;
- d2u64 = vld1_u64((uint64_t *)d1);
- d1 += dest_stride;
- d3u64 = vld1_u64((uint64_t *)d1);
- d1 += dest_stride;
-
- q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64));
- q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64));
- q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64));
- q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64));
-
- d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
- d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
- d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
- d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
-
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
- d2 += dest_stride;
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
- d2 += dest_stride;
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
- d2 += dest_stride;
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
- d2 += dest_stride;
+ int stride) {
+ const int16x8_t cospis = vld1q_s16(kCospi);
+ const int16x8_t cospisd = vaddq_s16(cospis, cospis);
+ const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24
+ const int16x4_t cospisd0 = vget_low_s16(cospisd); // doubled 0, 8, 16, 24
+ const int16x4_t cospisd1 = vget_high_s16(cospisd); // doubled 4, 12, 20, 28
+ int16x4_t a0, a1, a2, a3, a4, a5, a6, a7;
+ int16x8_t b0, b1, b2, b3, b4, b5, b6, b7;
+
+ a0 = load_tran_low_to_s16d(input);
+ a1 = load_tran_low_to_s16d(input + 8);
+ a2 = load_tran_low_to_s16d(input + 16);
+ a3 = load_tran_low_to_s16d(input + 24);
+
+ idct8x8_12_pass1_bd8(cospis0, cospisd0, cospisd1, &a0, &a1, &a2, &a3, &a4,
+ &a5, &a6, &a7);
+ idct8x8_12_pass2_bd8(cospis0, cospisd0, cospisd1, a0, a1, a2, a3, a4, a5, a6,
+ a7, &b0, &b1, &b2, &b3, &b4, &b5, &b6, &b7);
+ add8x8(b0, b1, b2, b3, b4, b5, b6, b7, dest, stride);
}
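
The 12-coefficient path now goes through the shared idct8x8_12_pass*_bd8() helpers, which multiply by doubled cospi constants (cospisd) via vqrdmulh. Doubling works because vqrdmulh computes (2*a*b + 2^15) >> 16 (with saturation), so with b = 2*c it reduces exactly to (a*c + 2^13) >> 14, i.e. dct_const_round_shift(a * c). A small scalar model of that identity (illustrative, not libvpx code):

#include <stdint.h>

static int16_t vqrdmulh_model(int16_t a, int16_t b) {
  /* saturation omitted; the product only overflows for a == b == INT16_MIN */
  return (int16_t)((2 * (int32_t)a * b + (1 << 15)) >> 16);
}

static int16_t dct_const_round_shift_model(int16_t a, int16_t c) {
  return (int16_t)(((int32_t)a * c + (1 << 13)) >> 14);
}

/* For the cospi constants used here, 2*c still fits in int16_t, and
 * vqrdmulh_model(a, (int16_t)(2 * c)) equals dct_const_round_shift_model(a, c). */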
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.asm
index f39e8ddd4b4..5dd9bdc7888 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.asm
@@ -10,8 +10,9 @@
INCLUDE ./vpx_config.asm
- ; Helper function used to load tran_low_t into int16, narrowing if
+ ; Helper functions used to load tran_low_t into int16, narrowing if
; necessary.
+
; $dst0..3 are d registers with the pairs assumed to be contiguous in
; non-high-bitdepth builds. q0-q3 are used as temporaries in high-bitdepth.
MACRO
@@ -27,4 +28,19 @@
vld1.s16 {$dst0-$dst1,$dst2-$dst3}, [$src]!
ENDIF
MEND
+
+ ; $dst0..3 are d registers. q0-q3 are used as temporaries in high-bitdepth.
+ MACRO
+ LOAD_TRAN_LOW_TO_S16X2 $dst0, $dst1, $dst2, $dst3, $src
+ IF CONFIG_VP9_HIGHBITDEPTH
+ vld2.s32 {q0,q1}, [$src]!
+ vld2.s32 {q2,q3}, [$src]!
+ vmovn.i32 $dst0, q0
+ vmovn.i32 $dst1, q2
+ vmovn.i32 $dst2, q1
+ vmovn.i32 $dst3, q3
+ ELSE
+ vld2.s16 {$dst0,$dst1,$dst2,$dst3}, [$src]!
+ ENDIF
+ MEND
END
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.h b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.h
index 5c2a53c034f..d9b85223c76 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.h
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.h
@@ -17,10 +17,45 @@
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_dsp/vpx_dsp_common.h"
+DECLARE_ALIGNED(16, static const int16_t, kCospi[16]) = {
+ 16384 /* cospi_0_64 */, 15137 /* cospi_8_64 */,
+ 11585 /* cospi_16_64 */, 6270 /* cospi_24_64 */,
+ 16069 /* cospi_4_64 */, 13623 /* cospi_12_64 */,
+ -9102 /* -cospi_20_64 */, 3196 /* cospi_28_64 */,
+ 16305 /* cospi_2_64 */, 1606 /* cospi_30_64 */,
+ 14449 /* cospi_10_64 */, 7723 /* cospi_22_64 */,
+ 15679 /* cospi_6_64 */, -4756 /* -cospi_26_64 */,
+ 12665 /* cospi_14_64 */, -10394 /* -cospi_18_64 */
+};
+
+DECLARE_ALIGNED(16, static const int32_t, kCospi32[8]) = {
+ 16384 /* cospi_0_64 */, 15137 /* cospi_8_64 */,
+ 11585 /* cospi_16_64 */, 6270 /* cospi_24_64 */,
+ 16069 /* cospi_4_64 */, 13623 /* cospi_12_64 */,
+ -9102 /* -cospi_20_64 */, 3196 /* cospi_28_64 */
+};
+
//------------------------------------------------------------------------------
+// Helper functions used to load tran_low_t into int16, narrowing if necessary.
-// Helper function used to load tran_low_t into int16, narrowing if necessary.
-static INLINE int16x8_t load_tran_low_to_s16(const tran_low_t *buf) {
+static INLINE int16x8x2_t load_tran_low_to_s16x2q(const tran_low_t *buf) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ const int32x4x2_t v0 = vld2q_s32(buf);
+ const int32x4x2_t v1 = vld2q_s32(buf + 8);
+ const int16x4_t s0 = vmovn_s32(v0.val[0]);
+ const int16x4_t s1 = vmovn_s32(v0.val[1]);
+ const int16x4_t s2 = vmovn_s32(v1.val[0]);
+ const int16x4_t s3 = vmovn_s32(v1.val[1]);
+ int16x8x2_t res;
+ res.val[0] = vcombine_s16(s0, s2);
+ res.val[1] = vcombine_s16(s1, s3);
+ return res;
+#else
+ return vld2q_s16(buf);
+#endif
+}
+
+static INLINE int16x8_t load_tran_low_to_s16q(const tran_low_t *buf) {
#if CONFIG_VP9_HIGHBITDEPTH
const int32x4_t v0 = vld1q_s32(buf);
const int32x4_t v1 = vld1q_s32(buf + 4);
@@ -32,6 +67,17 @@ static INLINE int16x8_t load_tran_low_to_s16(const tran_low_t *buf) {
#endif
}
+static INLINE int16x4_t load_tran_low_to_s16d(const tran_low_t *buf) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ const int32x4_t v0 = vld1q_s32(buf);
+ return vmovn_s32(v0);
+#else
+ return vld1_s16(buf);
+#endif
+}
+
+//------------------------------------------------------------------------------
+
// Multiply a by a_const. Saturate, shift and narrow by 14.
static INLINE int16x8_t multiply_shift_and_narrow_s16(const int16x8_t a,
const int16_t a_const) {
@@ -85,30 +131,6 @@ static INLINE int16x8_t multiply_accumulate_shift_and_narrow_s16(
return vcombine_s16(vrshrn_n_s32(temp_low, 14), vrshrn_n_s32(temp_high, 14));
}
-static INLINE void load_and_transpose_s16_8x8(const int16_t *a, int a_stride,
- int16x8_t *a0, int16x8_t *a1,
- int16x8_t *a2, int16x8_t *a3,
- int16x8_t *a4, int16x8_t *a5,
- int16x8_t *a6, int16x8_t *a7) {
- *a0 = vld1q_s16(a);
- a += a_stride;
- *a1 = vld1q_s16(a);
- a += a_stride;
- *a2 = vld1q_s16(a);
- a += a_stride;
- *a3 = vld1q_s16(a);
- a += a_stride;
- *a4 = vld1q_s16(a);
- a += a_stride;
- *a5 = vld1q_s16(a);
- a += a_stride;
- *a6 = vld1q_s16(a);
- a += a_stride;
- *a7 = vld1q_s16(a);
-
- transpose_s16_8x8(a0, a1, a2, a3, a4, a5, a6, a7);
-}
-
// Shift the output down by 6 and add it to the destination buffer.
static INLINE void add_and_store_u8_s16(const int16x8_t a0, const int16x8_t a1,
const int16x8_t a2, const int16x8_t a3,
@@ -169,4 +191,354 @@ static INLINE void add_and_store_u8_s16(const int16x8_t a0, const int16x8_t a1,
b += b_stride;
vst1_u8(b, b7);
}
+
+static INLINE uint8x16_t create_dcq(const int16_t dc) {
+ // Clip both sides and gcc may compile to assembly 'usat'.
+ const int16_t t = (dc < 0) ? 0 : ((dc > 255) ? 255 : dc);
+ return vdupq_n_u8((uint8_t)t);
+}
+
+static INLINE void idct4x4_16_kernel_bd8(const int16x4_t cospis,
+ int16x8_t *const a0,
+ int16x8_t *const a1) {
+ int16x4_t b0, b1, b2, b3;
+ int32x4_t c0, c1, c2, c3;
+ int16x8_t d0, d1;
+
+ transpose_s16_4x4q(a0, a1);
+ b0 = vget_low_s16(*a0);
+ b1 = vget_high_s16(*a0);
+ b2 = vget_low_s16(*a1);
+ b3 = vget_high_s16(*a1);
+ c0 = vmull_lane_s16(b0, cospis, 2);
+ c2 = vmull_lane_s16(b1, cospis, 2);
+ c1 = vsubq_s32(c0, c2);
+ c0 = vaddq_s32(c0, c2);
+ c2 = vmull_lane_s16(b2, cospis, 3);
+ c3 = vmull_lane_s16(b2, cospis, 1);
+ c2 = vmlsl_lane_s16(c2, b3, cospis, 1);
+ c3 = vmlal_lane_s16(c3, b3, cospis, 3);
+ b0 = vrshrn_n_s32(c0, 14);
+ b1 = vrshrn_n_s32(c1, 14);
+ b2 = vrshrn_n_s32(c2, 14);
+ b3 = vrshrn_n_s32(c3, 14);
+ d0 = vcombine_s16(b0, b1);
+ d1 = vcombine_s16(b3, b2);
+ *a0 = vaddq_s16(d0, d1);
+ *a1 = vsubq_s16(d0, d1);
+}
+
+static INLINE void idct8x8_12_pass1_bd8(
+ const int16x4_t cospis0, const int16x4_t cospisd0, const int16x4_t cospisd1,
+ int16x4_t *const io0, int16x4_t *const io1, int16x4_t *const io2,
+ int16x4_t *const io3, int16x4_t *const io4, int16x4_t *const io5,
+ int16x4_t *const io6, int16x4_t *const io7) {
+ int16x4_t step1[8], step2[8];
+ int32x4_t t32[2];
+
+ transpose_s16_4x4d(io0, io1, io2, io3);
+
+ // stage 1
+ step1[4] = vqrdmulh_lane_s16(*io1, cospisd1, 3);
+ step1[5] = vqrdmulh_lane_s16(*io3, cospisd1, 2);
+ step1[6] = vqrdmulh_lane_s16(*io3, cospisd1, 1);
+ step1[7] = vqrdmulh_lane_s16(*io1, cospisd1, 0);
+
+ // stage 2
+ step2[1] = vqrdmulh_lane_s16(*io0, cospisd0, 2);
+ step2[2] = vqrdmulh_lane_s16(*io2, cospisd0, 3);
+ step2[3] = vqrdmulh_lane_s16(*io2, cospisd0, 1);
+
+ step2[4] = vadd_s16(step1[4], step1[5]);
+ step2[5] = vsub_s16(step1[4], step1[5]);
+ step2[6] = vsub_s16(step1[7], step1[6]);
+ step2[7] = vadd_s16(step1[7], step1[6]);
+
+ // stage 3
+ step1[0] = vadd_s16(step2[1], step2[3]);
+ step1[1] = vadd_s16(step2[1], step2[2]);
+ step1[2] = vsub_s16(step2[1], step2[2]);
+ step1[3] = vsub_s16(step2[1], step2[3]);
+
+ t32[1] = vmull_lane_s16(step2[6], cospis0, 2);
+ t32[0] = vmlsl_lane_s16(t32[1], step2[5], cospis0, 2);
+ t32[1] = vmlal_lane_s16(t32[1], step2[5], cospis0, 2);
+ step1[5] = vrshrn_n_s32(t32[0], 14);
+ step1[6] = vrshrn_n_s32(t32[1], 14);
+
+ // stage 4
+ *io0 = vadd_s16(step1[0], step2[7]);
+ *io1 = vadd_s16(step1[1], step1[6]);
+ *io2 = vadd_s16(step1[2], step1[5]);
+ *io3 = vadd_s16(step1[3], step2[4]);
+ *io4 = vsub_s16(step1[3], step2[4]);
+ *io5 = vsub_s16(step1[2], step1[5]);
+ *io6 = vsub_s16(step1[1], step1[6]);
+ *io7 = vsub_s16(step1[0], step2[7]);
+}
+
+static INLINE void idct8x8_12_pass2_bd8(
+ const int16x4_t cospis0, const int16x4_t cospisd0, const int16x4_t cospisd1,
+ const int16x4_t input0, const int16x4_t input1, const int16x4_t input2,
+ const int16x4_t input3, const int16x4_t input4, const int16x4_t input5,
+ const int16x4_t input6, const int16x4_t input7, int16x8_t *const output0,
+ int16x8_t *const output1, int16x8_t *const output2,
+ int16x8_t *const output3, int16x8_t *const output4,
+ int16x8_t *const output5, int16x8_t *const output6,
+ int16x8_t *const output7) {
+ int16x8_t in[4];
+ int16x8_t step1[8], step2[8];
+ int32x4_t t32[8];
+ int16x4_t t16[8];
+
+ transpose_s16_4x8(input0, input1, input2, input3, input4, input5, input6,
+ input7, &in[0], &in[1], &in[2], &in[3]);
+
+ // stage 1
+ step1[4] = vqrdmulhq_lane_s16(in[1], cospisd1, 3);
+ step1[5] = vqrdmulhq_lane_s16(in[3], cospisd1, 2);
+ step1[6] = vqrdmulhq_lane_s16(in[3], cospisd1, 1);
+ step1[7] = vqrdmulhq_lane_s16(in[1], cospisd1, 0);
+
+ // stage 2
+ step2[1] = vqrdmulhq_lane_s16(in[0], cospisd0, 2);
+ step2[2] = vqrdmulhq_lane_s16(in[2], cospisd0, 3);
+ step2[3] = vqrdmulhq_lane_s16(in[2], cospisd0, 1);
+
+ step2[4] = vaddq_s16(step1[4], step1[5]);
+ step2[5] = vsubq_s16(step1[4], step1[5]);
+ step2[6] = vsubq_s16(step1[7], step1[6]);
+ step2[7] = vaddq_s16(step1[7], step1[6]);
+
+ // stage 3
+ step1[0] = vaddq_s16(step2[1], step2[3]);
+ step1[1] = vaddq_s16(step2[1], step2[2]);
+ step1[2] = vsubq_s16(step2[1], step2[2]);
+ step1[3] = vsubq_s16(step2[1], step2[3]);
+
+ t32[2] = vmull_lane_s16(vget_low_s16(step2[6]), cospis0, 2);
+ t32[3] = vmull_lane_s16(vget_high_s16(step2[6]), cospis0, 2);
+ t32[0] = vmlsl_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2);
+ t32[1] = vmlsl_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2);
+ t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2);
+ t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2);
+ t16[0] = vrshrn_n_s32(t32[0], 14);
+ t16[1] = vrshrn_n_s32(t32[1], 14);
+ t16[2] = vrshrn_n_s32(t32[2], 14);
+ t16[3] = vrshrn_n_s32(t32[3], 14);
+ step1[5] = vcombine_s16(t16[0], t16[1]);
+ step1[6] = vcombine_s16(t16[2], t16[3]);
+
+ // stage 4
+ *output0 = vaddq_s16(step1[0], step2[7]);
+ *output1 = vaddq_s16(step1[1], step1[6]);
+ *output2 = vaddq_s16(step1[2], step1[5]);
+ *output3 = vaddq_s16(step1[3], step2[4]);
+ *output4 = vsubq_s16(step1[3], step2[4]);
+ *output5 = vsubq_s16(step1[2], step1[5]);
+ *output6 = vsubq_s16(step1[1], step1[6]);
+ *output7 = vsubq_s16(step1[0], step2[7]);
+}
+
+static INLINE void idct8x8_64_1d_bd8(const int16x4_t cospis0,
+ const int16x4_t cospis1,
+ int16x8_t *const io0, int16x8_t *const io1,
+ int16x8_t *const io2, int16x8_t *const io3,
+ int16x8_t *const io4, int16x8_t *const io5,
+ int16x8_t *const io6,
+ int16x8_t *const io7) {
+ int16x4_t input_1l, input_1h, input_3l, input_3h, input_5l, input_5h,
+ input_7l, input_7h;
+ int16x4_t step1l[4], step1h[4];
+ int16x8_t step1[8], step2[8];
+ int32x4_t t32[8];
+ int16x4_t t16[8];
+
+ transpose_s16_8x8(io0, io1, io2, io3, io4, io5, io6, io7);
+
+ // stage 1
+ input_1l = vget_low_s16(*io1);
+ input_1h = vget_high_s16(*io1);
+ input_3l = vget_low_s16(*io3);
+ input_3h = vget_high_s16(*io3);
+ input_5l = vget_low_s16(*io5);
+ input_5h = vget_high_s16(*io5);
+ input_7l = vget_low_s16(*io7);
+ input_7h = vget_high_s16(*io7);
+ step1l[0] = vget_low_s16(*io0);
+ step1h[0] = vget_high_s16(*io0);
+ step1l[1] = vget_low_s16(*io2);
+ step1h[1] = vget_high_s16(*io2);
+ step1l[2] = vget_low_s16(*io4);
+ step1h[2] = vget_high_s16(*io4);
+ step1l[3] = vget_low_s16(*io6);
+ step1h[3] = vget_high_s16(*io6);
+
+ t32[0] = vmull_lane_s16(input_1l, cospis1, 3);
+ t32[1] = vmull_lane_s16(input_1h, cospis1, 3);
+ t32[2] = vmull_lane_s16(input_3l, cospis1, 2);
+ t32[3] = vmull_lane_s16(input_3h, cospis1, 2);
+ t32[4] = vmull_lane_s16(input_3l, cospis1, 1);
+ t32[5] = vmull_lane_s16(input_3h, cospis1, 1);
+ t32[6] = vmull_lane_s16(input_1l, cospis1, 0);
+ t32[7] = vmull_lane_s16(input_1h, cospis1, 0);
+ t32[0] = vmlsl_lane_s16(t32[0], input_7l, cospis1, 0);
+ t32[1] = vmlsl_lane_s16(t32[1], input_7h, cospis1, 0);
+ t32[2] = vmlal_lane_s16(t32[2], input_5l, cospis1, 1);
+ t32[3] = vmlal_lane_s16(t32[3], input_5h, cospis1, 1);
+ t32[4] = vmlsl_lane_s16(t32[4], input_5l, cospis1, 2);
+ t32[5] = vmlsl_lane_s16(t32[5], input_5h, cospis1, 2);
+ t32[6] = vmlal_lane_s16(t32[6], input_7l, cospis1, 3);
+ t32[7] = vmlal_lane_s16(t32[7], input_7h, cospis1, 3);
+ t16[0] = vrshrn_n_s32(t32[0], 14);
+ t16[1] = vrshrn_n_s32(t32[1], 14);
+ t16[2] = vrshrn_n_s32(t32[2], 14);
+ t16[3] = vrshrn_n_s32(t32[3], 14);
+ t16[4] = vrshrn_n_s32(t32[4], 14);
+ t16[5] = vrshrn_n_s32(t32[5], 14);
+ t16[6] = vrshrn_n_s32(t32[6], 14);
+ t16[7] = vrshrn_n_s32(t32[7], 14);
+ step1[4] = vcombine_s16(t16[0], t16[1]);
+ step1[5] = vcombine_s16(t16[2], t16[3]);
+ step1[6] = vcombine_s16(t16[4], t16[5]);
+ step1[7] = vcombine_s16(t16[6], t16[7]);
+
+ // stage 2
+ t32[2] = vmull_lane_s16(step1l[0], cospis0, 2);
+ t32[3] = vmull_lane_s16(step1h[0], cospis0, 2);
+ t32[4] = vmull_lane_s16(step1l[1], cospis0, 3);
+ t32[5] = vmull_lane_s16(step1h[1], cospis0, 3);
+ t32[6] = vmull_lane_s16(step1l[1], cospis0, 1);
+ t32[7] = vmull_lane_s16(step1h[1], cospis0, 1);
+ t32[0] = vmlal_lane_s16(t32[2], step1l[2], cospis0, 2);
+ t32[1] = vmlal_lane_s16(t32[3], step1h[2], cospis0, 2);
+ t32[2] = vmlsl_lane_s16(t32[2], step1l[2], cospis0, 2);
+ t32[3] = vmlsl_lane_s16(t32[3], step1h[2], cospis0, 2);
+ t32[4] = vmlsl_lane_s16(t32[4], step1l[3], cospis0, 1);
+ t32[5] = vmlsl_lane_s16(t32[5], step1h[3], cospis0, 1);
+ t32[6] = vmlal_lane_s16(t32[6], step1l[3], cospis0, 3);
+ t32[7] = vmlal_lane_s16(t32[7], step1h[3], cospis0, 3);
+ t16[0] = vrshrn_n_s32(t32[0], 14);
+ t16[1] = vrshrn_n_s32(t32[1], 14);
+ t16[2] = vrshrn_n_s32(t32[2], 14);
+ t16[3] = vrshrn_n_s32(t32[3], 14);
+ t16[4] = vrshrn_n_s32(t32[4], 14);
+ t16[5] = vrshrn_n_s32(t32[5], 14);
+ t16[6] = vrshrn_n_s32(t32[6], 14);
+ t16[7] = vrshrn_n_s32(t32[7], 14);
+ step2[0] = vcombine_s16(t16[0], t16[1]);
+ step2[1] = vcombine_s16(t16[2], t16[3]);
+ step2[2] = vcombine_s16(t16[4], t16[5]);
+ step2[3] = vcombine_s16(t16[6], t16[7]);
+
+ step2[4] = vaddq_s16(step1[4], step1[5]);
+ step2[5] = vsubq_s16(step1[4], step1[5]);
+ step2[6] = vsubq_s16(step1[7], step1[6]);
+ step2[7] = vaddq_s16(step1[7], step1[6]);
+
+ // stage 3
+ step1[0] = vaddq_s16(step2[0], step2[3]);
+ step1[1] = vaddq_s16(step2[1], step2[2]);
+ step1[2] = vsubq_s16(step2[1], step2[2]);
+ step1[3] = vsubq_s16(step2[0], step2[3]);
+
+ t32[2] = vmull_lane_s16(vget_low_s16(step2[6]), cospis0, 2);
+ t32[3] = vmull_lane_s16(vget_high_s16(step2[6]), cospis0, 2);
+ t32[0] = vmlsl_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2);
+ t32[1] = vmlsl_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2);
+ t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2);
+ t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2);
+ t16[0] = vrshrn_n_s32(t32[0], 14);
+ t16[1] = vrshrn_n_s32(t32[1], 14);
+ t16[2] = vrshrn_n_s32(t32[2], 14);
+ t16[3] = vrshrn_n_s32(t32[3], 14);
+ step1[5] = vcombine_s16(t16[0], t16[1]);
+ step1[6] = vcombine_s16(t16[2], t16[3]);
+
+ // stage 4
+ *io0 = vaddq_s16(step1[0], step2[7]);
+ *io1 = vaddq_s16(step1[1], step1[6]);
+ *io2 = vaddq_s16(step1[2], step1[5]);
+ *io3 = vaddq_s16(step1[3], step2[4]);
+ *io4 = vsubq_s16(step1[3], step2[4]);
+ *io5 = vsubq_s16(step1[2], step1[5]);
+ *io6 = vsubq_s16(step1[1], step1[6]);
+ *io7 = vsubq_s16(step1[0], step2[7]);
+}
+
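Every vmull/vmlsl/vmlal pair followed by vrshrn_n_s32(..., 14) in the helpers above implements the same fixed-point butterfly step. A minimal scalar sketch of that step, assuming the usual libvpx Q14 cospi constants (the helper name is illustrative):

#include <stdint.h>

/* One Q14 butterfly: multiply 16-bit inputs by 14-bit fixed-point cosines,
 * accumulate in 32 bits (vmull + vmlal, or vmlsl for the subtracting form),
 * then round and narrow back to 16 bits (vrshrn_n_s32(..., 14)). */
static int16_t butterfly_q14(int16_t a, int16_t b, int16_t cospi_a,
                             int16_t cospi_b) {
  const int32_t t = (int32_t)a * cospi_a + (int32_t)b * cospi_b;
  return (int16_t)((t + (1 << 13)) >> 14);
}

For the subtracting variants (vmlsl_lane_s16), the same sketch applies with cospi_b negated.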
+static INLINE void idct16x16_add_wrap_low_8x2(const int32x4_t *const t32,
+ int16x8_t *const d0,
+ int16x8_t *const d1) {
+ int16x4_t t16[4];
+
+ t16[0] = vrshrn_n_s32(t32[0], 14);
+ t16[1] = vrshrn_n_s32(t32[1], 14);
+ t16[2] = vrshrn_n_s32(t32[2], 14);
+ t16[3] = vrshrn_n_s32(t32[3], 14);
+ *d0 = vcombine_s16(t16[0], t16[1]);
+ *d1 = vcombine_s16(t16[2], t16[3]);
+}
+
+static INLINE void idct_cospi_8_24_q_kernel(const int16x8_t s0,
+ const int16x8_t s1,
+ const int16x4_t cospi_0_8_16_24,
+ int32x4_t *const t32) {
+ t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_0_8_16_24, 3);
+ t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_0_8_16_24, 3);
+ t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_0_8_16_24, 3);
+ t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_0_8_16_24, 3);
+ t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_0_8_16_24, 1);
+ t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_0_8_16_24, 1);
+ t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_0_8_16_24, 1);
+ t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_0_8_16_24, 1);
+}
+
+static INLINE void idct_cospi_8_24_q(const int16x8_t s0, const int16x8_t s1,
+ const int16x4_t cospi_0_8_16_24,
+ int16x8_t *const d0, int16x8_t *const d1) {
+ int32x4_t t32[4];
+
+ idct_cospi_8_24_q_kernel(s0, s1, cospi_0_8_16_24, t32);
+ idct16x16_add_wrap_low_8x2(t32, d0, d1);
+}
+
+static INLINE void idct_cospi_8_24_neg_q(const int16x8_t s0, const int16x8_t s1,
+ const int16x4_t cospi_0_8_16_24,
+ int16x8_t *const d0,
+ int16x8_t *const d1) {
+ int32x4_t t32[4];
+
+ idct_cospi_8_24_q_kernel(s0, s1, cospi_0_8_16_24, t32);
+ t32[2] = vnegq_s32(t32[2]);
+ t32[3] = vnegq_s32(t32[3]);
+ idct16x16_add_wrap_low_8x2(t32, d0, d1);
+}
+
+static INLINE void idct_cospi_16_16_q(const int16x8_t s0, const int16x8_t s1,
+ const int16x4_t cospi_0_8_16_24,
+ int16x8_t *const d0,
+ int16x8_t *const d1) {
+ int32x4_t t32[6];
+
+ t32[4] = vmull_lane_s16(vget_low_s16(s1), cospi_0_8_16_24, 2);
+ t32[5] = vmull_lane_s16(vget_high_s16(s1), cospi_0_8_16_24, 2);
+ t32[0] = vmlsl_lane_s16(t32[4], vget_low_s16(s0), cospi_0_8_16_24, 2);
+ t32[1] = vmlsl_lane_s16(t32[5], vget_high_s16(s0), cospi_0_8_16_24, 2);
+ t32[2] = vmlal_lane_s16(t32[4], vget_low_s16(s0), cospi_0_8_16_24, 2);
+ t32[3] = vmlal_lane_s16(t32[5], vget_high_s16(s0), cospi_0_8_16_24, 2);
+ idct16x16_add_wrap_low_8x2(t32, d0, d1);
+}
+
+static INLINE void idct16x16_add8x1(int16x8_t res, uint8_t **dest,
+ const int stride) {
+ uint8x8_t d = vld1_u8(*dest);
+ uint16x8_t q;
+
+ res = vrshrq_n_s16(res, 6);
+ q = vaddw_u8(vreinterpretq_u16_s16(res), d);
+ d = vqmovun_s16(vreinterpretq_s16_u16(q));
+ vst1_u8(*dest, d);
+ *dest += stride;
+}
+
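For reference, a scalar sketch of the per-row reconstruction done by idct16x16_add8x1 above: round the residual by 6 bits, add it to the destination pixels, and saturate to [0, 255]. The function name is illustrative; the NEON version performs the same add through u16 wraparound (vaddw_u8) before saturating with vqmovun_s16.

#include <stdint.h>

static void add8x1_scalar(const int16_t *res, uint8_t *dest) {
  int i;
  for (i = 0; i < 8; ++i) {
    const int v = dest[i] + ((res[i] + 32) >> 6);          /* vrshrq_n_s16(res, 6) */
    dest[i] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));  /* saturate like vqmovun_s16 */
  }
}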
#endif // VPX_DSP_ARM_IDCT_NEON_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/intrapred_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/intrapred_neon.c
index e150a5302d5..fb1fa6b681d 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/intrapred_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/intrapred_neon.c
@@ -346,20 +346,54 @@ void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
vst1q_u8(dst, above_right);
}
+void vpx_d45_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t A0_0 = vld1q_u8(above);
+ const uint8x16_t A0_1 = vld1q_u8(above + 16);
+ const uint8x16_t above_right = vdupq_lane_u8(vget_high_u8(A0_1), 7);
+ const uint8x16_t A1_0 = vld1q_u8(above + 1);
+ const uint8x16_t A1_1 = vld1q_u8(above + 17);
+ const uint8x16_t A2_0 = vld1q_u8(above + 2);
+ const uint8x16_t A2_1 = vld1q_u8(above + 18);
+ const uint8x16_t avg_0 = vhaddq_u8(A0_0, A2_0);
+ const uint8x16_t avg_1 = vhaddq_u8(A0_1, A2_1);
+ uint8x16_t row_0 = vrhaddq_u8(avg_0, A1_0);
+ uint8x16_t row_1 = vrhaddq_u8(avg_1, A1_1);
+ int i;
+ (void)left;
+
+ vst1q_u8(dst, row_0);
+ dst += 16;
+ vst1q_u8(dst, row_1);
+ dst += stride - 16;
+
+ for (i = 0; i < 30; ++i) {
+ row_0 = vextq_u8(row_0, row_1, 1);
+ row_1 = vextq_u8(row_1, above_right, 1);
+ vst1q_u8(dst, row_0);
+ dst += 16;
+ vst1q_u8(dst, row_1);
+ dst += stride - 16;
+ }
+
+ vst1q_u8(dst, above_right);
+ dst += 16;
+ vst1q_u8(dst, row_1);
+}
+
// -----------------------------------------------------------------------------
void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
- const uint8x8_t XABCD = vld1_u8(above - 1);
- const uint32x2_t zero = vdup_n_u32(0);
- const uint32x2_t IJKL = vld1_lane_u32((const uint32_t *)left, zero, 0);
- const uint8x8_t LKJI = vrev64_u8(vreinterpret_u8_u32(IJKL));
- const uint8x8_t LKJIXABC = vext_u8(LKJI, XABCD, 4);
- const uint8x8_t KJIXABCD = vext_u8(LKJI, XABCD, 5);
- const uint8x8_t JIXABCD0 =
- vreinterpret_u8_u64(vshr_n_u64(vreinterpret_u64_u8(KJIXABCD), 8));
- const uint8x8_t avg1 = vhadd_u8(JIXABCD0, LKJIXABC);
- const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABCD);
+ const uint8x8_t XA0123 = vld1_u8(above - 1);
+ const uint8x8_t L0123 = vld1_u8(left);
+ const uint8x8_t L3210 = vrev64_u8(L0123);
+ const uint8x8_t L3210XA012 = vext_u8(L3210, XA0123, 4);
+ const uint8x8_t L210XA0123 = vext_u8(L3210, XA0123, 5);
+ const uint8x8_t L10XA0123_ =
+ vreinterpret_u8_u64(vshr_n_u64(vreinterpret_u64_u8(L210XA0123), 8));
+ const uint8x8_t avg1 = vhadd_u8(L10XA0123_, L3210XA012);
+ const uint8x8_t avg2 = vrhadd_u8(avg1, L210XA0123);
const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
const uint32x2_t r3 = vreinterpret_u32_u8(avg2);
const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
@@ -374,6 +408,265 @@ void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
vst1_lane_u32((uint32_t *)dst, r3, 0);
}
+void vpx_d135_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x8_t XA0123456 = vld1_u8(above - 1);
+ const uint8x8_t A01234567 = vld1_u8(above);
+ const uint8x8_t A1234567_ = vld1_u8(above + 1);
+ const uint8x8_t L01234567 = vld1_u8(left);
+ const uint8x8_t L76543210 = vrev64_u8(L01234567);
+ const uint8x8_t L6543210X = vext_u8(L76543210, XA0123456, 1);
+ const uint8x8_t L543210XA0 = vext_u8(L76543210, XA0123456, 2);
+ const uint8x16_t L76543210XA0123456 = vcombine_u8(L76543210, XA0123456);
+ const uint8x16_t L6543210XA01234567 = vcombine_u8(L6543210X, A01234567);
+ const uint8x16_t L543210XA01234567_ = vcombine_u8(L543210XA0, A1234567_);
+ const uint8x16_t avg = vhaddq_u8(L76543210XA0123456, L543210XA01234567_);
+ const uint8x16_t row = vrhaddq_u8(avg, L6543210XA01234567);
+ const uint8x8_t row_0 = vget_low_u8(row);
+ const uint8x8_t row_1 = vget_high_u8(row);
+ const uint8x8_t r0 = vext_u8(row_0, row_1, 7);
+ const uint8x8_t r1 = vext_u8(row_0, row_1, 6);
+ const uint8x8_t r2 = vext_u8(row_0, row_1, 5);
+ const uint8x8_t r3 = vext_u8(row_0, row_1, 4);
+ const uint8x8_t r4 = vext_u8(row_0, row_1, 3);
+ const uint8x8_t r5 = vext_u8(row_0, row_1, 2);
+ const uint8x8_t r6 = vext_u8(row_0, row_1, 1);
+
+ vst1_u8(dst, r0);
+ dst += stride;
+ vst1_u8(dst, r1);
+ dst += stride;
+ vst1_u8(dst, r2);
+ dst += stride;
+ vst1_u8(dst, r3);
+ dst += stride;
+ vst1_u8(dst, r4);
+ dst += stride;
+ vst1_u8(dst, r5);
+ dst += stride;
+ vst1_u8(dst, r6);
+ dst += stride;
+ vst1_u8(dst, row_0);
+}
+
+static INLINE void d135_store_16x8(
+ uint8_t **dst, const ptrdiff_t stride, const uint8x16_t row_0,
+ const uint8x16_t row_1, const uint8x16_t row_2, const uint8x16_t row_3,
+ const uint8x16_t row_4, const uint8x16_t row_5, const uint8x16_t row_6,
+ const uint8x16_t row_7) {
+ vst1q_u8(*dst, row_0);
+ *dst += stride;
+ vst1q_u8(*dst, row_1);
+ *dst += stride;
+ vst1q_u8(*dst, row_2);
+ *dst += stride;
+ vst1q_u8(*dst, row_3);
+ *dst += stride;
+ vst1q_u8(*dst, row_4);
+ *dst += stride;
+ vst1q_u8(*dst, row_5);
+ *dst += stride;
+ vst1q_u8(*dst, row_6);
+ *dst += stride;
+ vst1q_u8(*dst, row_7);
+ *dst += stride;
+}
+
+void vpx_d135_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t XA0123456789abcde = vld1q_u8(above - 1);
+ const uint8x16_t A0123456789abcdef = vld1q_u8(above);
+ const uint8x16_t A123456789abcdef_ = vld1q_u8(above + 1);
+ const uint8x16_t L0123456789abcdef = vld1q_u8(left);
+ const uint8x8_t L76543210 = vrev64_u8(vget_low_u8(L0123456789abcdef));
+ const uint8x8_t Lfedcba98 = vrev64_u8(vget_high_u8(L0123456789abcdef));
+ const uint8x16_t Lfedcba9876543210 = vcombine_u8(Lfedcba98, L76543210);
+ const uint8x16_t Ledcba9876543210X =
+ vextq_u8(Lfedcba9876543210, XA0123456789abcde, 1);
+ const uint8x16_t Ldcba9876543210XA0 =
+ vextq_u8(Lfedcba9876543210, XA0123456789abcde, 2);
+ const uint8x16_t avg_0 = vhaddq_u8(Lfedcba9876543210, Ldcba9876543210XA0);
+ const uint8x16_t avg_1 = vhaddq_u8(XA0123456789abcde, A123456789abcdef_);
+ const uint8x16_t row_0 = vrhaddq_u8(avg_0, Ledcba9876543210X);
+ const uint8x16_t row_1 = vrhaddq_u8(avg_1, A0123456789abcdef);
+ const uint8x16_t r_0 = vextq_u8(row_0, row_1, 15);
+ const uint8x16_t r_1 = vextq_u8(row_0, row_1, 14);
+ const uint8x16_t r_2 = vextq_u8(row_0, row_1, 13);
+ const uint8x16_t r_3 = vextq_u8(row_0, row_1, 12);
+ const uint8x16_t r_4 = vextq_u8(row_0, row_1, 11);
+ const uint8x16_t r_5 = vextq_u8(row_0, row_1, 10);
+ const uint8x16_t r_6 = vextq_u8(row_0, row_1, 9);
+ const uint8x16_t r_7 = vcombine_u8(vget_high_u8(row_0), vget_low_u8(row_1));
+ const uint8x16_t r_8 = vextq_u8(row_0, row_1, 7);
+ const uint8x16_t r_9 = vextq_u8(row_0, row_1, 6);
+ const uint8x16_t r_a = vextq_u8(row_0, row_1, 5);
+ const uint8x16_t r_b = vextq_u8(row_0, row_1, 4);
+ const uint8x16_t r_c = vextq_u8(row_0, row_1, 3);
+ const uint8x16_t r_d = vextq_u8(row_0, row_1, 2);
+ const uint8x16_t r_e = vextq_u8(row_0, row_1, 1);
+
+ d135_store_16x8(&dst, stride, r_0, r_1, r_2, r_3, r_4, r_5, r_6, r_7);
+ d135_store_16x8(&dst, stride, r_8, r_9, r_a, r_b, r_c, r_d, r_e, row_0);
+}
+
+static INLINE void d135_store_32x2(uint8_t **dst, const ptrdiff_t stride,
+ const uint8x16_t row_0,
+ const uint8x16_t row_1,
+ const uint8x16_t row_2) {
+ uint8_t *dst2 = *dst;
+ vst1q_u8(dst2, row_1);
+ dst2 += 16;
+ vst1q_u8(dst2, row_2);
+ dst2 += 16 * stride - 16;
+ vst1q_u8(dst2, row_0);
+ dst2 += 16;
+ vst1q_u8(dst2, row_1);
+ *dst += stride;
+}
+
+void vpx_d135_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t LL0123456789abcdef = vld1q_u8(left + 16);
+ const uint8x16_t LU0123456789abcdef = vld1q_u8(left);
+ const uint8x8_t LL76543210 = vrev64_u8(vget_low_u8(LL0123456789abcdef));
+ const uint8x8_t LU76543210 = vrev64_u8(vget_low_u8(LU0123456789abcdef));
+ const uint8x8_t LLfedcba98 = vrev64_u8(vget_high_u8(LL0123456789abcdef));
+ const uint8x8_t LUfedcba98 = vrev64_u8(vget_high_u8(LU0123456789abcdef));
+ const uint8x16_t LLfedcba9876543210 = vcombine_u8(LLfedcba98, LL76543210);
+ const uint8x16_t LUfedcba9876543210 = vcombine_u8(LUfedcba98, LU76543210);
+ const uint8x16_t LLedcba9876543210Uf =
+ vextq_u8(LLfedcba9876543210, LUfedcba9876543210, 1);
+ const uint8x16_t LLdcba9876543210Ufe =
+ vextq_u8(LLfedcba9876543210, LUfedcba9876543210, 2);
+ const uint8x16_t avg_0 = vhaddq_u8(LLfedcba9876543210, LLdcba9876543210Ufe);
+ const uint8x16_t row_0 = vrhaddq_u8(avg_0, LLedcba9876543210Uf);
+
+ const uint8x16_t XAL0123456789abcde = vld1q_u8(above - 1);
+ const uint8x16_t LUedcba9876543210X =
+ vextq_u8(LUfedcba9876543210, XAL0123456789abcde, 1);
+ const uint8x16_t LUdcba9876543210XA0 =
+ vextq_u8(LUfedcba9876543210, XAL0123456789abcde, 2);
+ const uint8x16_t avg_1 = vhaddq_u8(LUfedcba9876543210, LUdcba9876543210XA0);
+ const uint8x16_t row_1 = vrhaddq_u8(avg_1, LUedcba9876543210X);
+
+ const uint8x16_t AL0123456789abcdef = vld1q_u8(above);
+ const uint8x16_t AL123456789abcdefg = vld1q_u8(above + 1);
+ const uint8x16_t ALfR0123456789abcde = vld1q_u8(above + 15);
+ const uint8x16_t AR0123456789abcdef = vld1q_u8(above + 16);
+ const uint8x16_t AR123456789abcdef_ = vld1q_u8(above + 17);
+ const uint8x16_t avg_2 = vhaddq_u8(XAL0123456789abcde, AL123456789abcdefg);
+ const uint8x16_t row_2 = vrhaddq_u8(avg_2, AL0123456789abcdef);
+ const uint8x16_t avg_3 = vhaddq_u8(ALfR0123456789abcde, AR123456789abcdef_);
+ const uint8x16_t row_3 = vrhaddq_u8(avg_3, AR0123456789abcdef);
+
+ {
+ const uint8x16_t r_0 = vextq_u8(row_0, row_1, 15);
+ const uint8x16_t r_1 = vextq_u8(row_1, row_2, 15);
+ const uint8x16_t r_2 = vextq_u8(row_2, row_3, 15);
+ d135_store_32x2(&dst, stride, r_0, r_1, r_2);
+ }
+
+ {
+ const uint8x16_t r_0 = vextq_u8(row_0, row_1, 14);
+ const uint8x16_t r_1 = vextq_u8(row_1, row_2, 14);
+ const uint8x16_t r_2 = vextq_u8(row_2, row_3, 14);
+ d135_store_32x2(&dst, stride, r_0, r_1, r_2);
+ }
+
+ {
+ const uint8x16_t r_0 = vextq_u8(row_0, row_1, 13);
+ const uint8x16_t r_1 = vextq_u8(row_1, row_2, 13);
+ const uint8x16_t r_2 = vextq_u8(row_2, row_3, 13);
+ d135_store_32x2(&dst, stride, r_0, r_1, r_2);
+ }
+
+ {
+ const uint8x16_t r_0 = vextq_u8(row_0, row_1, 12);
+ const uint8x16_t r_1 = vextq_u8(row_1, row_2, 12);
+ const uint8x16_t r_2 = vextq_u8(row_2, row_3, 12);
+ d135_store_32x2(&dst, stride, r_0, r_1, r_2);
+ }
+
+ {
+ const uint8x16_t r_0 = vextq_u8(row_0, row_1, 11);
+ const uint8x16_t r_1 = vextq_u8(row_1, row_2, 11);
+ const uint8x16_t r_2 = vextq_u8(row_2, row_3, 11);
+ d135_store_32x2(&dst, stride, r_0, r_1, r_2);
+ }
+
+ {
+ const uint8x16_t r_0 = vextq_u8(row_0, row_1, 10);
+ const uint8x16_t r_1 = vextq_u8(row_1, row_2, 10);
+ const uint8x16_t r_2 = vextq_u8(row_2, row_3, 10);
+ d135_store_32x2(&dst, stride, r_0, r_1, r_2);
+ }
+
+ {
+ const uint8x16_t r_0 = vextq_u8(row_0, row_1, 9);
+ const uint8x16_t r_1 = vextq_u8(row_1, row_2, 9);
+ const uint8x16_t r_2 = vextq_u8(row_2, row_3, 9);
+ d135_store_32x2(&dst, stride, r_0, r_1, r_2);
+ }
+
+ {
+ const uint8x16_t r_0 = vextq_u8(row_0, row_1, 8);
+ const uint8x16_t r_1 = vextq_u8(row_1, row_2, 8);
+ const uint8x16_t r_2 = vextq_u8(row_2, row_3, 8);
+ d135_store_32x2(&dst, stride, r_0, r_1, r_2);
+ }
+
+ {
+ const uint8x16_t r_0 = vextq_u8(row_0, row_1, 7);
+ const uint8x16_t r_1 = vextq_u8(row_1, row_2, 7);
+ const uint8x16_t r_2 = vextq_u8(row_2, row_3, 7);
+ d135_store_32x2(&dst, stride, r_0, r_1, r_2);
+ }
+
+ {
+ const uint8x16_t r_0 = vextq_u8(row_0, row_1, 6);
+ const uint8x16_t r_1 = vextq_u8(row_1, row_2, 6);
+ const uint8x16_t r_2 = vextq_u8(row_2, row_3, 6);
+ d135_store_32x2(&dst, stride, r_0, r_1, r_2);
+ }
+
+ {
+ const uint8x16_t r_0 = vextq_u8(row_0, row_1, 5);
+ const uint8x16_t r_1 = vextq_u8(row_1, row_2, 5);
+ const uint8x16_t r_2 = vextq_u8(row_2, row_3, 5);
+ d135_store_32x2(&dst, stride, r_0, r_1, r_2);
+ }
+
+ {
+ const uint8x16_t r_0 = vextq_u8(row_0, row_1, 4);
+ const uint8x16_t r_1 = vextq_u8(row_1, row_2, 4);
+ const uint8x16_t r_2 = vextq_u8(row_2, row_3, 4);
+ d135_store_32x2(&dst, stride, r_0, r_1, r_2);
+ }
+
+ {
+ const uint8x16_t r_0 = vextq_u8(row_0, row_1, 3);
+ const uint8x16_t r_1 = vextq_u8(row_1, row_2, 3);
+ const uint8x16_t r_2 = vextq_u8(row_2, row_3, 3);
+ d135_store_32x2(&dst, stride, r_0, r_1, r_2);
+ }
+
+ {
+ const uint8x16_t r_0 = vextq_u8(row_0, row_1, 2);
+ const uint8x16_t r_1 = vextq_u8(row_1, row_2, 2);
+ const uint8x16_t r_2 = vextq_u8(row_2, row_3, 2);
+ d135_store_32x2(&dst, stride, r_0, r_1, r_2);
+ }
+
+ {
+ const uint8x16_t r_0 = vextq_u8(row_0, row_1, 1);
+ const uint8x16_t r_1 = vextq_u8(row_1, row_2, 1);
+ const uint8x16_t r_2 = vextq_u8(row_2, row_3, 1);
+ d135_store_32x2(&dst, stride, r_0, r_1, r_2);
+ }
+
+ d135_store_32x2(&dst, stride, row_0, row_1, row_2);
+}
+
// -----------------------------------------------------------------------------
#if !HAVE_NEON_ASM
@@ -483,133 +776,98 @@ void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
vst1_u8(dst, d);
}
+static INLINE void h_store_16x8(uint8_t **dst, const ptrdiff_t stride,
+ const uint8x8_t left) {
+ const uint8x16_t row_0 = vdupq_lane_u8(left, 0);
+ const uint8x16_t row_1 = vdupq_lane_u8(left, 1);
+ const uint8x16_t row_2 = vdupq_lane_u8(left, 2);
+ const uint8x16_t row_3 = vdupq_lane_u8(left, 3);
+ const uint8x16_t row_4 = vdupq_lane_u8(left, 4);
+ const uint8x16_t row_5 = vdupq_lane_u8(left, 5);
+ const uint8x16_t row_6 = vdupq_lane_u8(left, 6);
+ const uint8x16_t row_7 = vdupq_lane_u8(left, 7);
+
+ vst1q_u8(*dst, row_0);
+ *dst += stride;
+ vst1q_u8(*dst, row_1);
+ *dst += stride;
+ vst1q_u8(*dst, row_2);
+ *dst += stride;
+ vst1q_u8(*dst, row_3);
+ *dst += stride;
+ vst1q_u8(*dst, row_4);
+ *dst += stride;
+ vst1q_u8(*dst, row_5);
+ *dst += stride;
+ vst1q_u8(*dst, row_6);
+ *dst += stride;
+ vst1q_u8(*dst, row_7);
+ *dst += stride;
+}
+
void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
const uint8x16_t left_u8q = vld1q_u8(left);
- uint8x8_t left_u8d = vget_low_u8(left_u8q);
- uint8x16_t d;
- int i;
(void)above;
- for (i = 0; i < 2; i++, left_u8d = vget_high_u8(left_u8q)) {
- d = vdupq_lane_u8(left_u8d, 0);
- vst1q_u8(dst, d);
- dst += stride;
- d = vdupq_lane_u8(left_u8d, 1);
- vst1q_u8(dst, d);
- dst += stride;
- d = vdupq_lane_u8(left_u8d, 2);
- vst1q_u8(dst, d);
- dst += stride;
- d = vdupq_lane_u8(left_u8d, 3);
- vst1q_u8(dst, d);
- dst += stride;
- d = vdupq_lane_u8(left_u8d, 4);
- vst1q_u8(dst, d);
- dst += stride;
- d = vdupq_lane_u8(left_u8d, 5);
- vst1q_u8(dst, d);
- dst += stride;
- d = vdupq_lane_u8(left_u8d, 6);
- vst1q_u8(dst, d);
- dst += stride;
- d = vdupq_lane_u8(left_u8d, 7);
- vst1q_u8(dst, d);
- dst += stride;
- }
+ h_store_16x8(&dst, stride, vget_low_u8(left_u8q));
+ h_store_16x8(&dst, stride, vget_high_u8(left_u8q));
+}
+
+static INLINE void h_store_32x8(uint8_t **dst, const ptrdiff_t stride,
+ const uint8x8_t left) {
+ const uint8x16_t row_0 = vdupq_lane_u8(left, 0);
+ const uint8x16_t row_1 = vdupq_lane_u8(left, 1);
+ const uint8x16_t row_2 = vdupq_lane_u8(left, 2);
+ const uint8x16_t row_3 = vdupq_lane_u8(left, 3);
+ const uint8x16_t row_4 = vdupq_lane_u8(left, 4);
+ const uint8x16_t row_5 = vdupq_lane_u8(left, 5);
+ const uint8x16_t row_6 = vdupq_lane_u8(left, 6);
+ const uint8x16_t row_7 = vdupq_lane_u8(left, 7);
+
+ vst1q_u8(*dst, row_0); // Note clang-3.8 produced poor code w/vst2q_u8
+ *dst += 16;
+ vst1q_u8(*dst, row_0);
+ *dst += stride - 16;
+ vst1q_u8(*dst, row_1);
+ *dst += 16;
+ vst1q_u8(*dst, row_1);
+ *dst += stride - 16;
+ vst1q_u8(*dst, row_2);
+ *dst += 16;
+ vst1q_u8(*dst, row_2);
+ *dst += stride - 16;
+ vst1q_u8(*dst, row_3);
+ *dst += 16;
+ vst1q_u8(*dst, row_3);
+ *dst += stride - 16;
+ vst1q_u8(*dst, row_4);
+ *dst += 16;
+ vst1q_u8(*dst, row_4);
+ *dst += stride - 16;
+ vst1q_u8(*dst, row_5);
+ *dst += 16;
+ vst1q_u8(*dst, row_5);
+ *dst += stride - 16;
+ vst1q_u8(*dst, row_6);
+ *dst += 16;
+ vst1q_u8(*dst, row_6);
+ *dst += stride - 16;
+ vst1q_u8(*dst, row_7);
+ *dst += 16;
+ vst1q_u8(*dst, row_7);
+ *dst += stride - 16;
}
void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
- uint8x16_t d;
int i;
(void)above;
for (i = 0; i < 2; i++, left += 16) {
const uint8x16_t left_u8 = vld1q_u8(left);
- const uint8x8_t left_low = vget_low_u8(left_u8);
- const uint8x8_t left_high = vget_high_u8(left_u8);
- d = vdupq_lane_u8(left_low, 0);
- vst1q_u8(dst, d); // Note clang-3.8 produced poor code w/vst2q_u8
- dst += 16;
- vst1q_u8(dst, d);
- dst += stride - 16;
- d = vdupq_lane_u8(left_low, 1);
- vst1q_u8(dst, d);
- dst += 16;
- vst1q_u8(dst, d);
- dst += stride - 16;
- d = vdupq_lane_u8(left_low, 2);
- vst1q_u8(dst, d);
- dst += 16;
- vst1q_u8(dst, d);
- dst += stride - 16;
- d = vdupq_lane_u8(left_low, 3);
- vst1q_u8(dst, d);
- dst += 16;
- vst1q_u8(dst, d);
- dst += stride - 16;
- d = vdupq_lane_u8(left_low, 4);
- vst1q_u8(dst, d);
- dst += 16;
- vst1q_u8(dst, d);
- dst += stride - 16;
- d = vdupq_lane_u8(left_low, 5);
- vst1q_u8(dst, d);
- dst += 16;
- vst1q_u8(dst, d);
- dst += stride - 16;
- d = vdupq_lane_u8(left_low, 6);
- vst1q_u8(dst, d);
- dst += 16;
- vst1q_u8(dst, d);
- dst += stride - 16;
- d = vdupq_lane_u8(left_low, 7);
- vst1q_u8(dst, d);
- dst += 16;
- vst1q_u8(dst, d);
- dst += stride - 16;
-
- d = vdupq_lane_u8(left_high, 0);
- vst1q_u8(dst, d);
- dst += 16;
- vst1q_u8(dst, d);
- dst += stride - 16;
- d = vdupq_lane_u8(left_high, 1);
- vst1q_u8(dst, d);
- dst += 16;
- vst1q_u8(dst, d);
- dst += stride - 16;
- d = vdupq_lane_u8(left_high, 2);
- vst1q_u8(dst, d);
- dst += 16;
- vst1q_u8(dst, d);
- dst += stride - 16;
- d = vdupq_lane_u8(left_high, 3);
- vst1q_u8(dst, d);
- dst += 16;
- vst1q_u8(dst, d);
- dst += stride - 16;
- d = vdupq_lane_u8(left_high, 4);
- vst1q_u8(dst, d);
- dst += 16;
- vst1q_u8(dst, d);
- dst += stride - 16;
- d = vdupq_lane_u8(left_high, 5);
- vst1q_u8(dst, d);
- dst += 16;
- vst1q_u8(dst, d);
- dst += stride - 16;
- d = vdupq_lane_u8(left_high, 6);
- vst1q_u8(dst, d);
- dst += 16;
- vst1q_u8(dst, d);
- dst += stride - 16;
- d = vdupq_lane_u8(left_high, 7);
- vst1q_u8(dst, d);
- dst += 16;
- vst1q_u8(dst, d);
- dst += stride - 16;
+ h_store_32x8(&dst, stride, vget_low_u8(left_u8));
+ h_store_32x8(&dst, stride, vget_high_u8(left_u8));
}
}
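The d45 and d135 predictors above are built from a single three-tap smoothing step applied to the reference samples; successive output rows are then just one-byte shifts of the filtered row (vextq_u8), so no per-row arithmetic is needed. A scalar sketch of what the vhaddq_u8/vrhaddq_u8 pair computes per byte (the function name is illustrative):

#include <stdint.h>

/* 8-bit-safe variant of the (a + 2*b + c + 2) >> 2 three-tap average:
 * a halving add of the outer taps, then a rounding halving add with the
 * centre tap, exactly what vhaddq_u8 followed by vrhaddq_u8 computes. */
static uint8_t avg3_hadd_rhadd(uint8_t a, uint8_t b, uint8_t c) {
  const unsigned half = ((unsigned)a + c) >> 1; /* vhadd: truncating halving add */
  return (uint8_t)((half + b + 1) >> 1);        /* vrhadd: rounding halving add */
}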
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/loopfilter_4_neon.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/loopfilter_4_neon.asm
index 5cd9170aea7..907e9183804 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/loopfilter_4_neon.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/loopfilter_4_neon.asm
@@ -11,6 +11,7 @@
EXPORT |vpx_lpf_horizontal_4_neon|
EXPORT |vpx_lpf_vertical_4_neon|
EXPORT |vpx_lpf_horizontal_4_dual_neon|
+ EXPORT |vpx_lpf_vertical_4_dual_neon|
ARM
AREA ||.text||, CODE, READONLY, ALIGN=2
@@ -54,7 +55,7 @@
sub r2, r2, r1, lsl #1
sub r3, r3, r1, lsl #1
- bl vpx_loop_filter_neon
+ bl filter4_8
vst1.u8 {d4}, [r2@64], r1 ; store op1
vst1.u8 {d5}, [r3@64], r1 ; store op0
@@ -114,7 +115,7 @@
vtrn.8 d7, d16
vtrn.8 d17, d18
- bl vpx_loop_filter_neon
+ bl filter4_8
sub r0, r0, #2
@@ -131,7 +132,7 @@
pop {pc}
ENDP ; |vpx_lpf_vertical_4_neon|
-; void vpx_loop_filter_neon();
+; void filter4_8();
; This is a helper function for the loopfilters. The individual functions do the
; necessary load, transpose (if necessary) and store. The function does not use
; registers d8-d15.
@@ -155,7 +156,7 @@
; d5 op0
; d6 oq0
; d7 oq1
-|vpx_loop_filter_neon| PROC
+|filter4_8| PROC
; filter_mask
vabd.u8 d19, d3, d4 ; m1 = abs(p3 - p2)
vabd.u8 d20, d4, d5 ; m2 = abs(p2 - p1)
@@ -245,7 +246,7 @@
veor d7, d20, d18 ; *oq1 = u^0x80
bx lr
- ENDP ; |vpx_loop_filter_neon|
+ ENDP ; |filter4_8|
;void vpx_lpf_horizontal_4_dual_neon(uint8_t *s, int p,
; const uint8_t *blimit0,
@@ -300,7 +301,7 @@
sub r2, r2, r1, lsl #1
sub r3, r3, r1, lsl #1
- bl vpx_loop_filter_neon_16
+ bl filter4_16
vst1.u8 {q5}, [r2@64], r1 ; store op1
vst1.u8 {q6}, [r3@64], r1 ; store op0
@@ -312,7 +313,122 @@
pop {pc}
ENDP ; |vpx_lpf_horizontal_4_dual_neon|
-; void vpx_loop_filter_neon_16();
+;void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int p,
+; const uint8_t *blimit0,
+; const uint8_t *limit0,
+; const uint8_t *thresh0,
+; const uint8_t *blimit1,
+; const uint8_t *limit1,
+; const uint8_t *thresh1)
+; r0 uint8_t *s,
+; r1 int p,
+; r2 const uint8_t *blimit0,
+; r3 const uint8_t *limit0,
+; sp const uint8_t *thresh0,
+; sp+4 const uint8_t *blimit1,
+; sp+8 const uint8_t *limit1,
+; sp+12 const uint8_t *thresh1,
+
+|vpx_lpf_vertical_4_dual_neon| PROC
+ push {lr}
+
+ ldr r12, [sp, #4] ; load thresh0
+ vld1.8 {d0}, [r2] ; load blimit0 to first half q
+ vld1.8 {d2}, [r3] ; load limit0 to first half q
+
+ ldr r2, [sp, #8] ; load blimit1
+
+ vld1.8 {d4}, [r12] ; load thresh0 to first half q
+
+ ldr r3, [sp, #12] ; load limit1
+ ldr r12, [sp, #16] ; load thresh1
+ vld1.8 {d1}, [r2] ; load blimit1 to 2nd half q
+
+ sub r2, r0, #4 ; s[-4]
+
+ vld1.8 {d3}, [r3] ; load limit1 to 2nd half q
+ vld1.8 {d5}, [r12] ; load thresh1 to 2nd half q
+
+ vpush {d8-d15} ; save neon registers
+
+ vld1.u8 {d6}, [r2], r1 ; 00 01 02 03 04 05 06 07
+ vld1.u8 {d8}, [r2], r1 ; 10 11 12 13 14 15 16 17
+ vld1.u8 {d10}, [r2], r1 ; 20 21 22 23 24 25 26 27
+ vld1.u8 {d12}, [r2], r1 ; 30 31 32 33 34 35 36 37
+ vld1.u8 {d14}, [r2], r1 ; 40 41 42 43 44 45 46 47
+ vld1.u8 {d16}, [r2], r1 ; 50 51 52 53 54 55 56 57
+ vld1.u8 {d18}, [r2], r1 ; 60 61 62 63 64 65 66 67
+ vld1.u8 {d20}, [r2], r1 ; 70 71 72 73 74 75 76 77
+ vld1.u8 {d7}, [r2], r1 ; 80 81 82 83 84 85 86 87
+ vld1.u8 {d9}, [r2], r1 ; 90 91 92 93 94 95 96 97
+ vld1.u8 {d11}, [r2], r1 ; A0 A1 A2 A3 A4 A5 A6 A7
+ vld1.u8 {d13}, [r2], r1 ; B0 B1 B2 B3 B4 B5 B6 B7
+ vld1.u8 {d15}, [r2], r1 ; C0 C1 C2 C3 C4 C5 C6 C7
+ vld1.u8 {d17}, [r2], r1 ; D0 D1 D2 D3 D4 D5 D6 D7
+ vld1.u8 {d19}, [r2], r1 ; E0 E1 E2 E3 E4 E5 E6 E7
+ vld1.u8 {d21}, [r2] ; F0 F1 F2 F3 F4 F5 F6 F7
+
+ vtrn.8 q3, q4 ; q3 : 00 10 02 12 04 14 06 16 80 90 82 92 84 94 86 96
+ ; q4 : 01 11 03 13 05 15 07 17 81 91 83 93 85 95 87 97
+ vtrn.8 q5, q6 ; q5 : 20 30 22 32 24 34 26 36 A0 B0 A2 B2 A4 B4 A6 B6
+ ; q6 : 21 31 23 33 25 35 27 37 A1 B1 A3 B3 A5 B5 A7 B7
+ vtrn.8 q7, q8 ; q7 : 40 50 42 52 44 54 46 56 C0 D0 C2 D2 C4 D4 C6 D6
+ ; q8 : 41 51 43 53 45 55 47 57 C1 D1 C3 D3 C5 D5 C7 D7
+ vtrn.8 q9, q10 ; q9 : 60 70 62 72 64 74 66 76 E0 F0 E2 F2 E4 F4 E6 F6
+ ; q10: 61 71 63 73 65 75 67 77 E1 F1 E3 F3 E5 F5 E7 F7
+
+ vtrn.16 q3, q5 ; q3 : 00 10 20 30 04 14 24 34 80 90 A0 B0 84 94 A4 B4
+ ; q5 : 02 12 22 32 06 16 26 36 82 92 A2 B2 86 96 A6 B6
+ vtrn.16 q4, q6 ; q4 : 01 11 21 31 05 15 25 35 81 91 A1 B1 85 95 A5 B5
+ ; q6 : 03 13 23 33 07 17 27 37 83 93 A3 B3 87 97 A7 B7
+ vtrn.16 q7, q9 ; q7 : 40 50 60 70 44 54 64 74 C0 D0 E0 F0 C4 D4 E4 F4
+ ; q9 : 42 52 62 72 46 56 66 76 C2 D2 E2 F2 C6 D6 E6 F6
+ vtrn.16 q8, q10 ; q8 : 41 51 61 71 45 55 65 75 C1 D1 E1 F1 C5 D5 E5 F5
+ ; q10: 43 53 63 73 47 57 67 77 C3 D3 E3 F3 C7 D7 E7 F7
+
+ vtrn.32 q3, q7 ; q3 : 00 10 20 30 40 50 60 70 80 90 A0 B0 C0 D0 E0 F0
+ ; q7 : 04 14 24 34 44 54 64 74 84 94 A4 B4 C4 D4 E4 F4
+ vtrn.32 q5, q9 ; q5 : 02 12 22 32 42 52 62 72 82 92 A2 B2 C2 D2 E2 F2
+ ; q9 : 06 16 26 36 46 56 66 76 86 96 A6 B6 C6 D6 E6 F6
+ vtrn.32 q4, q8 ; q4 : 01 11 21 31 41 51 61 71 81 91 A1 B1 C1 D1 E1 F1
+ ; q8 : 05 15 25 35 45 55 65 75 85 95 A5 B5 C5 D5 E5 F5
+ vtrn.32 q6, q10 ; q6 : 03 13 23 33 43 53 63 73 83 93 A3 B3 C3 D3 E3 F3
+ ; q10: 07 17 27 37 47 57 67 77 87 97 A7 B7 C7 D7 E7 F7
+
+ bl filter4_16
+
+ sub r0, #2
+
+ vmov d0, d11
+ vmov d1, d13
+ vmov d2, d15
+ vmov d3, d17
+ vmov d11, d12
+ vmov d12, d14
+ vmov d13, d16
+ vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
+ vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r0], r1
+ vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1
+ vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r0], r1
+ vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1
+ vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [r0], r1
+ vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1
+ vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r0], r1
+ vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0], r1
+ vst4.8 {d0[1], d1[1], d2[1], d3[1]}, [r0], r1
+ vst4.8 {d0[2], d1[2], d2[2], d3[2]}, [r0], r1
+ vst4.8 {d0[3], d1[3], d2[3], d3[3]}, [r0], r1
+ vst4.8 {d0[4], d1[4], d2[4], d3[4]}, [r0], r1
+ vst4.8 {d0[5], d1[5], d2[5], d3[5]}, [r0], r1
+ vst4.8 {d0[6], d1[6], d2[6], d3[6]}, [r0], r1
+ vst4.8 {d0[7], d1[7], d2[7], d3[7]}, [r0]
+
+ vpop {d8-d15} ; restore neon registers
+
+ pop {pc}
+ ENDP ; |vpx_lpf_vertical_4_dual_neon|
+
+; void filter4_16();
; This is a helper function for the loopfilters. The individual functions do the
; necessary load, transpose (if necessary) and store. This function uses
; registers d8-d15, so the calling function must save those registers.
@@ -335,7 +451,7 @@
; q6 op0
; q7 oq0
; q8 oq1
-|vpx_loop_filter_neon_16| PROC
+|filter4_16| PROC
; filter_mask
vabd.u8 q11, q3, q4 ; m1 = abs(p3 - p2)
@@ -428,6 +544,6 @@
veor q8, q12, q10 ; *oq1 = u^0x80
bx lr
- ENDP ; |vpx_loop_filter_neon_16|
+ ENDP ; |filter4_16|
END
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/loopfilter_vertical_4_dual_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/loopfilter_vertical_4_dual_neon.c
deleted file mode 100644
index ced5aef0ab2..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/loopfilter_vertical_4_dual_neon.c
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-
-#include "./vpx_dsp_rtcd.h"
-#include "./vpx_config.h"
-#include "vpx/vpx_integer.h"
-
-void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
- const uint8_t *limit0, const uint8_t *thresh0,
- const uint8_t *blimit1, const uint8_t *limit1,
- const uint8_t *thresh1) {
- vpx_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0);
- vpx_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1);
-}
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/transpose_neon.h b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/transpose_neon.h
index 445add29689..8366ce50b87 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/transpose_neon.h
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/transpose_neon.h
@@ -21,7 +21,7 @@
//
// b0.val[0]: 00 01 02 03 16 17 18 19
// b0.val[1]: 04 05 06 07 20 21 22 23
-static INLINE int16x8x2_t vpx_vtrnq_s64(int32x4_t a0, int32x4_t a1) {
+static INLINE int16x8x2_t vpx_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) {
int16x8x2_t b0;
b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)),
vreinterpret_s16_s32(vget_low_s32(a1)));
@@ -30,7 +30,23 @@ static INLINE int16x8x2_t vpx_vtrnq_s64(int32x4_t a0, int32x4_t a1) {
return b0;
}
-static INLINE uint8x16x2_t vpx_vtrnq_u64(uint32x4_t a0, uint32x4_t a1) {
+static INLINE int32x4x2_t vpx_vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1) {
+ int32x4x2_t b0;
+ b0.val[0] = vcombine_s32(vget_low_s32(a0), vget_low_s32(a1));
+ b0.val[1] = vcombine_s32(vget_high_s32(a0), vget_high_s32(a1));
+ return b0;
+}
+
+static INLINE int64x2x2_t vpx_vtrnq_s64(int32x4_t a0, int32x4_t a1) {
+ int64x2x2_t b0;
+ b0.val[0] = vcombine_s64(vreinterpret_s64_s32(vget_low_s32(a0)),
+ vreinterpret_s64_s32(vget_low_s32(a1)));
+ b0.val[1] = vcombine_s64(vreinterpret_s64_s32(vget_high_s32(a0)),
+ vreinterpret_s64_s32(vget_high_s32(a1)));
+ return b0;
+}
+
+static INLINE uint8x16x2_t vpx_vtrnq_u64_to_u8(uint32x4_t a0, uint32x4_t a1) {
uint8x16x2_t b0;
b0.val[0] = vcombine_u8(vreinterpret_u8_u32(vget_low_u32(a0)),
vreinterpret_u8_u32(vget_low_u32(a1)));
@@ -110,6 +126,37 @@ static INLINE void transpose_s16_4x4d(int16x4_t *a0, int16x4_t *a1,
*a3 = vreinterpret_s16_s32(c1.val[1]);
}
+static INLINE void transpose_s16_4x4q(int16x8_t *a0, int16x8_t *a1) {
+ // Swap 32 bit elements. Goes from:
+ // a0: 00 01 02 03 10 11 12 13
+ // a1: 20 21 22 23 30 31 32 33
+ // to:
+ // b0.val[0]: 00 01 20 21 10 11 30 31
+ // b0.val[1]: 02 03 22 23 12 13 32 33
+
+ const int32x4x2_t b0 =
+ vtrnq_s32(vreinterpretq_s32_s16(*a0), vreinterpretq_s32_s16(*a1));
+
+ // Swap 64 bit elements resulting in:
+ // c0.val[0]: 00 01 20 21 02 03 22 23
+ // c0.val[1]: 10 11 30 31 12 13 32 33
+
+ const int32x4_t c0 =
+ vcombine_s32(vget_low_s32(b0.val[0]), vget_low_s32(b0.val[1]));
+ const int32x4_t c1 =
+ vcombine_s32(vget_high_s32(b0.val[0]), vget_high_s32(b0.val[1]));
+
+ // Swap 16 bit elements resulting in:
+ // d0.val[0]: 00 10 20 30 02 12 22 32
+ // d0.val[1]: 01 11 21 31 03 13 23 33
+
+ const int16x8x2_t d0 =
+ vtrnq_s16(vreinterpretq_s16_s32(c0), vreinterpretq_s16_s32(c1));
+
+ *a0 = d0.val[0];
+ *a1 = d0.val[1];
+}
+
static INLINE void transpose_u16_4x4q(uint16x8_t *a0, uint16x8_t *a1) {
// Swap 32 bit elements. Goes from:
// a0: 00 01 02 03 10 11 12 13
@@ -141,6 +188,211 @@ static INLINE void transpose_u16_4x4q(uint16x8_t *a0, uint16x8_t *a1) {
*a1 = d0.val[1];
}
+static INLINE void transpose_u8_4x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
+ uint8x8_t *a3, const uint8x8_t a4,
+ const uint8x8_t a5, const uint8x8_t a6,
+ const uint8x8_t a7) {
+ // Swap 32 bit elements. Goes from:
+ // a0: 00 01 02 03 XX XX XX XX
+ // a1: 10 11 12 13 XX XX XX XX
+ // a2: 20 21 22 23 XX XX XX XX
+ // a3: 30 31 32 33 XX XX XX XX
+ // a4: 40 41 42 43 XX XX XX XX
+ // a5: 50 51 52 53 XX XX XX XX
+ // a6: 60 61 62 63 XX XX XX XX
+ // a7: 70 71 72 73 XX XX XX XX
+ // to:
+ // b0.val[0]: 00 01 02 03 40 41 42 43
+ // b1.val[0]: 10 11 12 13 50 51 52 53
+ // b2.val[0]: 20 21 22 23 60 61 62 63
+ // b3.val[0]: 30 31 32 33 70 71 72 73
+
+ const uint32x2x2_t b0 =
+ vtrn_u32(vreinterpret_u32_u8(*a0), vreinterpret_u32_u8(a4));
+ const uint32x2x2_t b1 =
+ vtrn_u32(vreinterpret_u32_u8(*a1), vreinterpret_u32_u8(a5));
+ const uint32x2x2_t b2 =
+ vtrn_u32(vreinterpret_u32_u8(*a2), vreinterpret_u32_u8(a6));
+ const uint32x2x2_t b3 =
+ vtrn_u32(vreinterpret_u32_u8(*a3), vreinterpret_u32_u8(a7));
+
+ // Swap 16 bit elements resulting in:
+ // c0.val[0]: 00 01 20 21 40 41 60 61
+ // c0.val[1]: 02 03 22 23 42 43 62 63
+ // c1.val[0]: 10 11 30 31 50 51 70 71
+ // c1.val[1]: 12 13 32 33 52 53 72 73
+
+ const uint16x4x2_t c0 = vtrn_u16(vreinterpret_u16_u32(b0.val[0]),
+ vreinterpret_u16_u32(b2.val[0]));
+ const uint16x4x2_t c1 = vtrn_u16(vreinterpret_u16_u32(b1.val[0]),
+ vreinterpret_u16_u32(b3.val[0]));
+
+ // Swap 8 bit elements resulting in:
+ // d0.val[0]: 00 10 20 30 40 50 60 70
+ // d0.val[1]: 01 11 21 31 41 51 61 71
+ // d1.val[0]: 02 12 22 32 42 52 62 72
+ // d1.val[1]: 03 13 23 33 43 53 63 73
+
+ const uint8x8x2_t d0 =
+ vtrn_u8(vreinterpret_u8_u16(c0.val[0]), vreinterpret_u8_u16(c1.val[0]));
+ const uint8x8x2_t d1 =
+ vtrn_u8(vreinterpret_u8_u16(c0.val[1]), vreinterpret_u8_u16(c1.val[1]));
+
+ *a0 = d0.val[0];
+ *a1 = d0.val[1];
+ *a2 = d1.val[0];
+ *a3 = d1.val[1];
+}
+
+static INLINE void transpose_s32_4x4(int32x4_t *a0, int32x4_t *a1,
+ int32x4_t *a2, int32x4_t *a3) {
+ // Swap 32 bit elements. Goes from:
+ // a0: 00 01 02 03
+ // a1: 10 11 12 13
+ // a2: 20 21 22 23
+ // a3: 30 31 32 33
+ // to:
+ // b0.val[0]: 00 10 02 12
+ // b0.val[1]: 01 11 03 13
+ // b1.val[0]: 20 30 22 32
+ // b1.val[1]: 21 31 23 33
+
+ const int32x4x2_t b0 = vtrnq_s32(*a0, *a1);
+ const int32x4x2_t b1 = vtrnq_s32(*a2, *a3);
+
+ // Swap 64 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30
+ // c0.val[1]: 02 12 22 32
+ // c1.val[0]: 01 11 21 31
+ // c1.val[1]: 03 13 23 33
+
+ const int32x4x2_t c0 = vpx_vtrnq_s64_to_s32(b0.val[0], b1.val[0]);
+ const int32x4x2_t c1 = vpx_vtrnq_s64_to_s32(b0.val[1], b1.val[1]);
+
+ *a0 = c0.val[0];
+ *a1 = c1.val[0];
+ *a2 = c0.val[1];
+ *a3 = c1.val[1];
+}
+
+static INLINE void transpose_s16_4x8(const int16x4_t a0, const int16x4_t a1,
+ const int16x4_t a2, const int16x4_t a3,
+ const int16x4_t a4, const int16x4_t a5,
+ const int16x4_t a6, const int16x4_t a7,
+ int16x8_t *const o0, int16x8_t *const o1,
+ int16x8_t *const o2, int16x8_t *const o3) {
+ // Swap 16 bit elements. Goes from:
+ // a0: 00 01 02 03
+ // a1: 10 11 12 13
+ // a2: 20 21 22 23
+ // a3: 30 31 32 33
+ // a4: 40 41 42 43
+ // a5: 50 51 52 53
+ // a6: 60 61 62 63
+ // a7: 70 71 72 73
+ // to:
+ // b0.val[0]: 00 10 02 12
+ // b0.val[1]: 01 11 03 13
+ // b1.val[0]: 20 30 22 32
+ // b1.val[1]: 21 31 23 33
+ // b2.val[0]: 40 50 42 52
+ // b2.val[1]: 41 51 43 53
+ // b3.val[0]: 60 70 62 72
+ // b3.val[1]: 61 71 63 73
+
+ const int16x4x2_t b0 = vtrn_s16(a0, a1);
+ const int16x4x2_t b1 = vtrn_s16(a2, a3);
+ const int16x4x2_t b2 = vtrn_s16(a4, a5);
+ const int16x4x2_t b3 = vtrn_s16(a6, a7);
+
+ // Swap 32 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30
+ // c0.val[1]: 02 12 22 32
+ // c1.val[0]: 01 11 21 31
+ // c1.val[1]: 03 13 23 33
+ // c2.val[0]: 40 50 60 70
+ // c2.val[1]: 42 52 62 72
+ // c3.val[0]: 41 51 61 71
+ // c3.val[1]: 43 53 63 73
+
+ const int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]),
+ vreinterpret_s32_s16(b1.val[0]));
+ const int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]),
+ vreinterpret_s32_s16(b1.val[1]));
+ const int32x2x2_t c2 = vtrn_s32(vreinterpret_s32_s16(b2.val[0]),
+ vreinterpret_s32_s16(b3.val[0]));
+ const int32x2x2_t c3 = vtrn_s32(vreinterpret_s32_s16(b2.val[1]),
+ vreinterpret_s32_s16(b3.val[1]));
+
+ // Swap 64 bit elements resulting in:
+ // o0: 00 10 20 30 40 50 60 70
+ // o1: 01 11 21 31 41 51 61 71
+ // o2: 02 12 22 32 42 52 62 72
+ // o3: 03 13 23 33 43 53 63 73
+
+ *o0 = vcombine_s16(vreinterpret_s16_s32(c0.val[0]),
+ vreinterpret_s16_s32(c2.val[0]));
+ *o1 = vcombine_s16(vreinterpret_s16_s32(c1.val[0]),
+ vreinterpret_s16_s32(c3.val[0]));
+ *o2 = vcombine_s16(vreinterpret_s16_s32(c0.val[1]),
+ vreinterpret_s16_s32(c2.val[1]));
+ *o3 = vcombine_s16(vreinterpret_s16_s32(c1.val[1]),
+ vreinterpret_s16_s32(c3.val[1]));
+}
+
+static INLINE void transpose_s32_4x8(int32x4_t *const a0, int32x4_t *const a1,
+ int32x4_t *const a2, int32x4_t *const a3,
+ int32x4_t *const a4, int32x4_t *const a5,
+ int32x4_t *const a6, int32x4_t *const a7) {
+ // Swap 32 bit elements. Goes from:
+ // a0: 00 01 02 03
+ // a1: 10 11 12 13
+ // a2: 20 21 22 23
+ // a3: 30 31 32 33
+ // a4: 40 41 42 43
+ // a5: 50 51 52 53
+ // a6: 60 61 62 63
+ // a7: 70 71 72 73
+ // to:
+ // b0.val[0]: 00 10 02 12
+ // b0.val[1]: 01 11 03 13
+ // b1.val[0]: 20 30 22 32
+ // b1.val[1]: 21 31 23 33
+ // b2.val[0]: 40 50 42 52
+ // b2.val[1]: 41 51 43 53
+ // b3.val[0]: 60 70 62 72
+ // b3.val[1]: 61 71 63 73
+
+ const int32x4x2_t b0 = vtrnq_s32(*a0, *a1);
+ const int32x4x2_t b1 = vtrnq_s32(*a2, *a3);
+ const int32x4x2_t b2 = vtrnq_s32(*a4, *a5);
+ const int32x4x2_t b3 = vtrnq_s32(*a6, *a7);
+
+ // Swap 64 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30
+ // c0.val[1]: 02 12 22 32
+ // c1.val[0]: 01 11 21 31
+ // c1.val[1]: 03 13 23 33
+ // c2.val[0]: 40 50 60 70
+ // c2.val[1]: 42 52 62 72
+ // c3.val[0]: 41 51 61 71
+ // c3.val[1]: 43 53 63 73
+
+ const int64x2x2_t c0 = vpx_vtrnq_s64(b0.val[0], b1.val[0]);
+ const int64x2x2_t c1 = vpx_vtrnq_s64(b0.val[1], b1.val[1]);
+ const int64x2x2_t c2 = vpx_vtrnq_s64(b2.val[0], b3.val[0]);
+ const int64x2x2_t c3 = vpx_vtrnq_s64(b2.val[1], b3.val[1]);
+
+ *a0 = vreinterpretq_s32_s64(c0.val[0]);
+ *a1 = vreinterpretq_s32_s64(c2.val[0]);
+ *a2 = vreinterpretq_s32_s64(c1.val[0]);
+ *a3 = vreinterpretq_s32_s64(c3.val[0]);
+ *a4 = vreinterpretq_s32_s64(c0.val[1]);
+ *a5 = vreinterpretq_s32_s64(c2.val[1]);
+ *a6 = vreinterpretq_s32_s64(c1.val[1]);
+ *a7 = vreinterpretq_s32_s64(c3.val[1]);
+}
+
static INLINE void transpose_u8_8x4(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
uint8x8_t *a3) {
// Swap 8 bit elements. Goes from:
@@ -207,6 +459,59 @@ static INLINE void transpose_u16_8x4(uint16x8_t *a0, uint16x8_t *a1,
*a3 = vreinterpretq_u16_u32(c1.val[1]);
}
+static INLINE void transpose_s32_8x4(int32x4_t *const a0, int32x4_t *const a1,
+ int32x4_t *const a2, int32x4_t *const a3,
+ int32x4_t *const a4, int32x4_t *const a5,
+ int32x4_t *const a6, int32x4_t *const a7) {
+ // Swap 32 bit elements. Goes from:
+ // a0: 00 01 02 03
+ // a1: 04 05 06 07
+ // a2: 10 11 12 13
+ // a3: 14 15 16 17
+ // a4: 20 21 22 23
+ // a5: 24 25 26 27
+ // a6: 30 31 32 33
+ // a7: 34 35 36 37
+ // to:
+ // b0.val[0]: 00 10 02 12
+ // b0.val[1]: 01 11 03 13
+ // b1.val[0]: 04 14 06 16
+ // b1.val[1]: 05 15 07 17
+ // b2.val[0]: 20 30 22 32
+ // b2.val[1]: 21 31 23 33
+ // b3.val[0]: 24 34 26 36
+ // b3.val[1]: 25 35 27 37
+
+ const int32x4x2_t b0 = vtrnq_s32(*a0, *a2);
+ const int32x4x2_t b1 = vtrnq_s32(*a1, *a3);
+ const int32x4x2_t b2 = vtrnq_s32(*a4, *a6);
+ const int32x4x2_t b3 = vtrnq_s32(*a5, *a7);
+
+ // Swap 64 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30
+ // c0.val[1]: 02 12 22 32
+ // c1.val[0]: 01 11 21 31
+ // c1.val[1]: 03 13 23 33
+ // c2.val[0]: 04 14 24 34
+ // c2.val[1]: 06 16 26 36
+ // c3.val[0]: 05 15 25 35
+ // c3.val[1]: 07 17 27 37
+
+ const int64x2x2_t c0 = vpx_vtrnq_s64(b0.val[0], b2.val[0]);
+ const int64x2x2_t c1 = vpx_vtrnq_s64(b0.val[1], b2.val[1]);
+ const int64x2x2_t c2 = vpx_vtrnq_s64(b1.val[0], b3.val[0]);
+ const int64x2x2_t c3 = vpx_vtrnq_s64(b1.val[1], b3.val[1]);
+
+ *a0 = vreinterpretq_s32_s64(c0.val[0]);
+ *a1 = vreinterpretq_s32_s64(c1.val[0]);
+ *a2 = vreinterpretq_s32_s64(c0.val[1]);
+ *a3 = vreinterpretq_s32_s64(c1.val[1]);
+ *a4 = vreinterpretq_s32_s64(c2.val[0]);
+ *a5 = vreinterpretq_s32_s64(c3.val[0]);
+ *a6 = vreinterpretq_s32_s64(c2.val[1]);
+ *a7 = vreinterpretq_s32_s64(c3.val[1]);
+}
+
// Note: Using 'd' registers or 'q' registers has almost identical speed. We use
// 'q' registers here to save some instructions.
static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
@@ -319,10 +624,10 @@ static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1,
// d2.val[1]: 06 16 26 36 46 56 66 76
// d3.val[0]: 03 13 23 33 43 53 63 73
// d3.val[1]: 07 17 27 37 47 57 67 77
- const int16x8x2_t d0 = vpx_vtrnq_s64(c0.val[0], c2.val[0]);
- const int16x8x2_t d1 = vpx_vtrnq_s64(c1.val[0], c3.val[0]);
- const int16x8x2_t d2 = vpx_vtrnq_s64(c0.val[1], c2.val[1]);
- const int16x8x2_t d3 = vpx_vtrnq_s64(c1.val[1], c3.val[1]);
+ const int16x8x2_t d0 = vpx_vtrnq_s64_to_s16(c0.val[0], c2.val[0]);
+ const int16x8x2_t d1 = vpx_vtrnq_s64_to_s16(c1.val[0], c3.val[0]);
+ const int16x8x2_t d2 = vpx_vtrnq_s64_to_s16(c0.val[1], c2.val[1]);
+ const int16x8x2_t d3 = vpx_vtrnq_s64_to_s16(c1.val[1], c3.val[1]);
*a0 = d0.val[0];
*a1 = d1.val[0];
@@ -758,14 +1063,14 @@ static INLINE void transpose_u8_16x16(
// e6.val[1]: 0E 1E 2E 3E 4E 5E 6E 7E 8E 9E AE BE CE DE EE FE
// e7.val[0]: 07 17 27 37 47 57 67 77 87 97 A7 B7 C7 D7 E7 F7
// e7.val[1]: 0F 1F 2F 3F 4F 5F 6F 7F 8F 9F AF BF CF DF EF FF
- const uint8x16x2_t e0 = vpx_vtrnq_u64(d0.val[0], d4.val[0]);
- const uint8x16x2_t e1 = vpx_vtrnq_u64(d2.val[0], d6.val[0]);
- const uint8x16x2_t e2 = vpx_vtrnq_u64(d1.val[0], d5.val[0]);
- const uint8x16x2_t e3 = vpx_vtrnq_u64(d3.val[0], d7.val[0]);
- const uint8x16x2_t e4 = vpx_vtrnq_u64(d0.val[1], d4.val[1]);
- const uint8x16x2_t e5 = vpx_vtrnq_u64(d2.val[1], d6.val[1]);
- const uint8x16x2_t e6 = vpx_vtrnq_u64(d1.val[1], d5.val[1]);
- const uint8x16x2_t e7 = vpx_vtrnq_u64(d3.val[1], d7.val[1]);
+ const uint8x16x2_t e0 = vpx_vtrnq_u64_to_u8(d0.val[0], d4.val[0]);
+ const uint8x16x2_t e1 = vpx_vtrnq_u64_to_u8(d2.val[0], d6.val[0]);
+ const uint8x16x2_t e2 = vpx_vtrnq_u64_to_u8(d1.val[0], d5.val[0]);
+ const uint8x16x2_t e3 = vpx_vtrnq_u64_to_u8(d3.val[0], d7.val[0]);
+ const uint8x16x2_t e4 = vpx_vtrnq_u64_to_u8(d0.val[1], d4.val[1]);
+ const uint8x16x2_t e5 = vpx_vtrnq_u64_to_u8(d2.val[1], d6.val[1]);
+ const uint8x16x2_t e6 = vpx_vtrnq_u64_to_u8(d1.val[1], d5.val[1]);
+ const uint8x16x2_t e7 = vpx_vtrnq_u64_to_u8(d3.val[1], d7.val[1]);
// Output:
// o0 : 00 10 20 30 40 50 60 70 80 90 A0 B0 C0 D0 E0 F0
@@ -802,4 +1107,101 @@ static INLINE void transpose_u8_16x16(
*o15 = e7.val[1];
}
+static INLINE void load_and_transpose_u8_4x8(const uint8_t *a,
+ const int a_stride, uint8x8_t *a0,
+ uint8x8_t *a1, uint8x8_t *a2,
+ uint8x8_t *a3) {
+ uint8x8_t a4, a5, a6, a7;
+ *a0 = vld1_u8(a);
+ a += a_stride;
+ *a1 = vld1_u8(a);
+ a += a_stride;
+ *a2 = vld1_u8(a);
+ a += a_stride;
+ *a3 = vld1_u8(a);
+ a += a_stride;
+ a4 = vld1_u8(a);
+ a += a_stride;
+ a5 = vld1_u8(a);
+ a += a_stride;
+ a6 = vld1_u8(a);
+ a += a_stride;
+ a7 = vld1_u8(a);
+
+ transpose_u8_4x8(a0, a1, a2, a3, a4, a5, a6, a7);
+}
+
+static INLINE void load_and_transpose_u8_8x8(const uint8_t *a,
+ const int a_stride, uint8x8_t *a0,
+ uint8x8_t *a1, uint8x8_t *a2,
+ uint8x8_t *a3, uint8x8_t *a4,
+ uint8x8_t *a5, uint8x8_t *a6,
+ uint8x8_t *a7) {
+ *a0 = vld1_u8(a);
+ a += a_stride;
+ *a1 = vld1_u8(a);
+ a += a_stride;
+ *a2 = vld1_u8(a);
+ a += a_stride;
+ *a3 = vld1_u8(a);
+ a += a_stride;
+ *a4 = vld1_u8(a);
+ a += a_stride;
+ *a5 = vld1_u8(a);
+ a += a_stride;
+ *a6 = vld1_u8(a);
+ a += a_stride;
+ *a7 = vld1_u8(a);
+
+ transpose_u8_8x8(a0, a1, a2, a3, a4, a5, a6, a7);
+}
+
+static INLINE void transpose_and_store_u8_8x8(uint8_t *a, const int a_stride,
+ uint8x8_t a0, uint8x8_t a1,
+ uint8x8_t a2, uint8x8_t a3,
+ uint8x8_t a4, uint8x8_t a5,
+ uint8x8_t a6, uint8x8_t a7) {
+ transpose_u8_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
+
+ vst1_u8(a, a0);
+ a += a_stride;
+ vst1_u8(a, a1);
+ a += a_stride;
+ vst1_u8(a, a2);
+ a += a_stride;
+ vst1_u8(a, a3);
+ a += a_stride;
+ vst1_u8(a, a4);
+ a += a_stride;
+ vst1_u8(a, a5);
+ a += a_stride;
+ vst1_u8(a, a6);
+ a += a_stride;
+ vst1_u8(a, a7);
+}
+
+static INLINE void load_and_transpose_s16_8x8(const int16_t *a,
+ const int a_stride, int16x8_t *a0,
+ int16x8_t *a1, int16x8_t *a2,
+ int16x8_t *a3, int16x8_t *a4,
+ int16x8_t *a5, int16x8_t *a6,
+ int16x8_t *a7) {
+ *a0 = vld1q_s16(a);
+ a += a_stride;
+ *a1 = vld1q_s16(a);
+ a += a_stride;
+ *a2 = vld1q_s16(a);
+ a += a_stride;
+ *a3 = vld1q_s16(a);
+ a += a_stride;
+ *a4 = vld1q_s16(a);
+ a += a_stride;
+ *a5 = vld1q_s16(a);
+ a += a_stride;
+ *a6 = vld1q_s16(a);
+ a += a_stride;
+ *a7 = vld1q_s16(a);
+
+ transpose_s16_8x8(a0, a1, a2, a3, a4, a5, a6, a7);
+}
#endif // VPX_DSP_ARM_TRANSPOSE_NEON_H_
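The transpose helpers in this header all follow the same width-doubling pattern: trn at 16-bit lanes, then 32-bit, then 64-bit. A scalar model of that pattern for one 8x8 block, illustrative only (the NEON versions pair whole rows held in registers rather than swapping in place):

#include <stdint.h>

/* At block size k = 1, 2, 4, swap the off-diagonal k x k sub-blocks of every
 * 2k x 2k tile; after the three passes the 8x8 block is transposed. */
static void transpose_8x8_scalar(int16_t m[8][8]) {
  int k, r, c;
  for (k = 1; k <= 4; k <<= 1) {
    for (r = 0; r < 8; ++r) {
      for (c = 0; c < 8; ++c) {
        if ((r & k) && !(c & k)) {
          const int16_t t = m[r][c];
          m[r][c] = m[r - k][c + k];
          m[r - k][c + k] = t;
        }
      }
    }
  }
}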
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/deblock.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/deblock.c
index 589b124e26a..6c27484979a 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/deblock.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/deblock.c
@@ -117,7 +117,7 @@ void vpx_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows,
unsigned char d[16];
for (r = 0; r < rows; r++) {
- int sumsq = 0;
+ int sumsq = 16;
int sum = 0;
for (i = -8; i < 0; i++) s[i] = s[0];
@@ -156,14 +156,12 @@ void vpx_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows,
void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,
int flimit) {
int r, c, i;
- const int16_t *rv3 = &vpx_rv[63 & rand()];
for (c = 0; c < cols; c++) {
unsigned char *s = &dst[c];
int sumsq = 0;
int sum = 0;
unsigned char d[16];
- const int16_t *rv2 = rv3 + ((c * 17) & 127);
for (i = -8; i < 0; i++) s[i * pitch] = s[0];
@@ -183,7 +181,7 @@ void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,
d[r & 15] = s[0];
if (sumsq * 15 - sum * sum < flimit) {
- d[r & 15] = (rv2[r & 127] + sum + s[0]) >> 4;
+ d[r & 15] = (vpx_rv[(r & 127) + (c & 7)] + sum + s[0]) >> 4;
}
if (r >= 8) s[-8 * pitch] = d[(r - 8) & 15];
s += pitch;
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/inv_txfm.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/inv_txfm.c
index f3f543ddfe8..0f9aff1892a 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/inv_txfm.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/inv_txfm.c
@@ -67,7 +67,7 @@ void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
}
}
-void vpx_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) {
+void vpx_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int stride) {
int i;
tran_high_t a1, e1;
tran_low_t tmp[4];
@@ -84,10 +84,10 @@ void vpx_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) {
for (i = 0; i < 4; i++) {
e1 = ip[0] >> 1;
a1 = ip[0] - e1;
- dest[dest_stride * 0] = clip_pixel_add(dest[dest_stride * 0], a1);
- dest[dest_stride * 1] = clip_pixel_add(dest[dest_stride * 1], e1);
- dest[dest_stride * 2] = clip_pixel_add(dest[dest_stride * 2], e1);
- dest[dest_stride * 3] = clip_pixel_add(dest[dest_stride * 3], e1);
+ dest[stride * 0] = clip_pixel_add(dest[stride * 0], a1);
+ dest[stride * 1] = clip_pixel_add(dest[stride * 1], e1);
+ dest[stride * 2] = clip_pixel_add(dest[stride * 2], e1);
+ dest[stride * 3] = clip_pixel_add(dest[stride * 3], e1);
ip++;
dest++;
}
@@ -138,8 +138,7 @@ void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
}
}
-void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest,
- int dest_stride) {
+void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
int i;
tran_high_t a1;
tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
@@ -152,7 +151,7 @@ void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest,
dest[1] = clip_pixel_add(dest[1], a1);
dest[2] = clip_pixel_add(dest[2], a1);
dest[3] = clip_pixel_add(dest[3], a1);
- dest += dest_stride;
+ dest += stride;
}
}
@@ -1324,7 +1323,7 @@ void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
}
void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
- int dest_stride, int bd) {
+ int stride, int bd) {
int i;
tran_high_t a1, e1;
tran_low_t tmp[4];
@@ -1343,14 +1342,10 @@ void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
for (i = 0; i < 4; i++) {
e1 = ip[0] >> 1;
a1 = ip[0] - e1;
- dest[dest_stride * 0] =
- highbd_clip_pixel_add(dest[dest_stride * 0], a1, bd);
- dest[dest_stride * 1] =
- highbd_clip_pixel_add(dest[dest_stride * 1], e1, bd);
- dest[dest_stride * 2] =
- highbd_clip_pixel_add(dest[dest_stride * 2], e1, bd);
- dest[dest_stride * 3] =
- highbd_clip_pixel_add(dest[dest_stride * 3], e1, bd);
+ dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0], a1, bd);
+ dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1], e1, bd);
+ dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2], e1, bd);
+ dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3], e1, bd);
ip++;
dest++;
}
@@ -1413,7 +1408,7 @@ void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
}
void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8,
- int dest_stride, int bd) {
+ int stride, int bd) {
int i;
tran_high_t a1;
tran_low_t out =
@@ -1428,7 +1423,7 @@ void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8,
dest[1] = highbd_clip_pixel_add(dest[1], a1, bd);
dest[2] = highbd_clip_pixel_add(dest[2], a1, bd);
dest[3] = highbd_clip_pixel_add(dest[3], a1, bd);
- dest += dest_stride;
+ dest += stride;
}
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/convolve8_avg_dspr2.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/convolve8_avg_dspr2.c
index 31812299c34..b4ed6ee850a 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/convolve8_avg_dspr2.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/convolve8_avg_dspr2.c
@@ -403,8 +403,11 @@ void vpx_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_y, int filter_y_stride, int w,
int h) {
int x, y;
- uint32_t tp1, tp2, tn1;
- uint32_t tp3, tp4, tn2;
+ uint32_t tp1, tp2, tn1, tp3, tp4, tn2;
+ (void)filter_x;
+ (void)filter_x_stride;
+ (void)filter_y;
+ (void)filter_y_stride;
/* prefetch data to cache memory */
prefetch_load(src);
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/convolve8_dspr2.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/convolve8_dspr2.c
index f6812c7d049..8d35b6394e2 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/convolve8_dspr2.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/convolve8_dspr2.c
@@ -1307,6 +1307,7 @@ void vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
assert(y_step_q4 == 16);
assert(((const int32_t *)filter_x)[1] != 0x800000);
assert(((const int32_t *)filter_y)[1] != 0x800000);
+ (void)x_step_q4;
/* bit position for extract from acc */
__asm__ __volatile__("wrdsp %[pos], 1 \n\t"
@@ -1398,6 +1399,10 @@ void vpx_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_y, int filter_y_stride,
int w, int h) {
int x, y;
+ (void)filter_x;
+ (void)filter_x_stride;
+ (void)filter_y;
+ (void)filter_y_stride;
/* prefetch data to cache memory */
prefetch_load(src);
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/deblock_msa.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/deblock_msa.c
index cc633c6698d..e33ea740a9e 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/deblock_msa.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/deblock_msa.c
@@ -459,7 +459,7 @@ void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch,
flimit_vec = __msa_fill_w(flimit);
for (row = rows; row--;) {
- int32_t sum_sq = 0;
+ int32_t sum_sq;
int32_t sum = 0;
src0 = (v16u8)__msa_fill_b(src_dup[0]);
ST8x1_UB(src0, (src_dup - 8));
@@ -474,7 +474,7 @@ void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch,
ILVRL_B2_UH(zero, src, src_r_h, src_l_h);
src_r_w = __msa_dotp_u_w(src_r_h, src_r_h);
src_r_w += __msa_dotp_u_w(src_l_h, src_l_h);
- sum_sq = HADD_SW_S32(src_r_w);
+ sum_sq = HADD_SW_S32(src_r_w) + 16;
sum_h = __msa_hadd_u_h(src, src);
sum = HADD_UH_U32(sum_h);
{
@@ -573,7 +573,6 @@ void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch,
void vpx_mbpost_proc_down_msa(uint8_t *dst_ptr, int32_t pitch, int32_t rows,
int32_t cols, int32_t flimit) {
int32_t row, col, cnt, i;
- const int16_t *rv3 = &vpx_rv[63 & rand()];
v4i32 flimit_vec;
v16u8 dst7, dst8, dst_r_b, dst_l_b;
v16i8 mask;
@@ -601,7 +600,7 @@ void vpx_mbpost_proc_down_msa(uint8_t *dst_ptr, int32_t pitch, int32_t rows,
dst = LD_UB(dst_tmp);
for (cnt = (col << 4), i = 0; i < 16; ++cnt) {
- rv2[i] = rv3 + ((cnt * 17) & 127);
+ rv2[i] = vpx_rv + (i & 7);
++i;
}
for (cnt = -8; cnt < 0; ++cnt) {
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/intrapred16_dspr2.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/intrapred16_dspr2.c
index 3e29d0ac39f..835e10e125c 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/intrapred16_dspr2.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/intrapred16_dspr2.c
@@ -15,6 +15,7 @@ void vpx_h_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
int32_t tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
+ (void)above;
__asm__ __volatile__(
"lb %[tmp1], (%[left]) \n\t"
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/intrapred4_dspr2.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/intrapred4_dspr2.c
index 9f51d50c752..dce03a2b2a0 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/intrapred4_dspr2.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/intrapred4_dspr2.c
@@ -14,6 +14,7 @@
void vpx_h_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
int32_t tmp1, tmp2, tmp3, tmp4;
+ (void)above;
__asm__ __volatile__(
"lb %[tmp1], (%[left]) \n\t"
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/intrapred8_dspr2.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/intrapred8_dspr2.c
index eac79d51000..16e7fc55079 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/intrapred8_dspr2.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/intrapred8_dspr2.c
@@ -14,6 +14,7 @@
void vpx_h_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
+ (void)above;
__asm__ __volatile__(
"lb %[tmp1], (%[left]) \n\t"
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/inv_txfm_dspr2.h b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/inv_txfm_dspr2.h
index edd54aec5e2..27881f0db6c 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/inv_txfm_dspr2.h
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/inv_txfm_dspr2.h
@@ -57,18 +57,15 @@ extern "C" {
out; \
})
-void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
- int dest_stride);
+void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int stride);
void vpx_idct4_rows_dspr2(const int16_t *input, int16_t *output);
-void vpx_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
- int dest_stride);
+void vpx_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, int stride);
void iadst4_dspr2(const int16_t *input, int16_t *output);
void idct8_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows);
-void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
- int dest_stride);
+void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, int stride);
void iadst8_dspr2(const int16_t *input, int16_t *output);
void idct16_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows);
-void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride);
+void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int stride);
void iadst16_dspr2(const int16_t *input, int16_t *output);
#endif // #if HAVE_DSPR2
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/itrans16_dspr2.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/itrans16_dspr2.c
index 0ec0c2059f4..44ba65c7ac8 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/itrans16_dspr2.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/itrans16_dspr2.c
@@ -389,7 +389,7 @@ void idct16_rows_dspr2(const int16_t *input, int16_t *output,
}
}
-void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) {
+void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int stride) {
int i;
int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
int step1_8, step1_9, step1_10, step1_11;
@@ -712,14 +712,14 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) {
"add %[load6], %[step1_1], %[step1_6] \n\t"
"add %[load6], %[load6], %[step1_14] \n\t"
"sb %[load5], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"lbu %[load8], 0(%[dest_pix]) \n\t"
"addi %[load6], %[load6], 32 \n\t"
"sra %[load6], %[load6], 6 \n\t"
"add %[load8], %[load8], %[load6] \n\t"
"lbux %[load6], %[load8](%[cm]) \n\t"
"sb %[load6], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"lbu %[load7], 0(%[dest_pix]) \n\t"
"add %[load5], %[step1_2], %[step1_5] \n\t"
@@ -731,14 +731,14 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) {
"add %[load6], %[step1_3], %[step1_4] \n\t"
"add %[load6], %[load6], %[step1_12] \n\t"
"sb %[load5], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"lbu %[load8], 0(%[dest_pix]) \n\t"
"addi %[load6], %[load6], 32 \n\t"
"sra %[load6], %[load6], 6 \n\t"
"add %[load8], %[load8], %[load6] \n\t"
"lbux %[load6], %[load8](%[cm]) \n\t"
"sb %[load6], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"lbu %[load7], 0(%[dest_pix]) \n\t"
"sub %[load5], %[step1_3], %[step1_4] \n\t"
@@ -750,14 +750,14 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) {
"sub %[load6], %[step1_2], %[step1_5] \n\t"
"add %[load6], %[load6], %[step1_10] \n\t"
"sb %[load5], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"lbu %[load8], 0(%[dest_pix]) \n\t"
"addi %[load6], %[load6], 32 \n\t"
"sra %[load6], %[load6], 6 \n\t"
"add %[load8], %[load8], %[load6] \n\t"
"lbux %[load6], %[load8](%[cm]) \n\t"
"sb %[load6], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"sub %[load5], %[step1_1], %[step1_6] \n\t"
"lbu %[load7], 0(%[dest_pix]) \n\t"
@@ -769,14 +769,14 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) {
"sub %[load6], %[step1_0], %[step1_7] \n\t"
"add %[load6], %[load6], %[step1_8] \n\t"
"sb %[load5], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"lbu %[load8], 0(%[dest_pix]) \n\t"
"addi %[load6], %[load6], 32 \n\t"
"sra %[load6], %[load6], 6 \n\t"
"add %[load8], %[load8], %[load6] \n\t"
"lbux %[load6], %[load8](%[cm]) \n\t"
"sb %[load6], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"lbu %[load7], 0(%[dest_pix]) \n\t"
"sub %[load5], %[step1_0], %[step1_7] \n\t"
@@ -788,14 +788,14 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) {
"sub %[load6], %[step1_1], %[step1_6] \n\t"
"sub %[load6], %[load6], %[step1_9] \n\t"
"sb %[load5], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"lbu %[load8], 0(%[dest_pix]) \n\t"
"addi %[load6], %[load6], 32 \n\t"
"sra %[load6], %[load6], 6 \n\t"
"add %[load8], %[load8], %[load6] \n\t"
"lbux %[load6], %[load8](%[cm]) \n\t"
"sb %[load6], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"lbu %[load7], 0(%[dest_pix]) \n\t"
"sub %[load5], %[step1_2], %[step1_5] \n\t"
@@ -807,14 +807,14 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) {
"sub %[load6], %[step1_3], %[step1_4] \n\t"
"sub %[load6], %[load6], %[step1_11] \n\t"
"sb %[load5], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"lbu %[load8], 0(%[dest_pix]) \n\t"
"addi %[load6], %[load6], 32 \n\t"
"sra %[load6], %[load6], 6 \n\t"
"add %[load8], %[load8], %[load6] \n\t"
"lbux %[load6], %[load8](%[cm]) \n\t"
"sb %[load6], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"lbu %[load7], 0(%[dest_pix]) \n\t"
"add %[load5], %[step1_3], %[step1_4] \n\t"
@@ -826,14 +826,14 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) {
"add %[load6], %[step1_2], %[step1_5] \n\t"
"sub %[load6], %[load6], %[step1_13] \n\t"
"sb %[load5], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"lbu %[load8], 0(%[dest_pix]) \n\t"
"addi %[load6], %[load6], 32 \n\t"
"sra %[load6], %[load6], 6 \n\t"
"add %[load8], %[load8], %[load6] \n\t"
"lbux %[load6], %[load8](%[cm]) \n\t"
"sb %[load6], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"lbu %[load7], 0(%[dest_pix]) \n\t"
"add %[load5], %[step1_1], %[step1_6] \n\t"
@@ -845,7 +845,7 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) {
"add %[load6], %[step1_0], %[step1_7] \n\t"
"sub %[load6], %[load6], %[step1_15] \n\t"
"sb %[load5], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"lbu %[load8], 0(%[dest_pix]) \n\t"
"addi %[load6], %[load6], 32 \n\t"
"sra %[load6], %[load6], 6 \n\t"
@@ -856,7 +856,7 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) {
: [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7),
[load8] "=&r"(load8), [dest_pix] "+r"(dest_pix)
:
- [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_0] "r"(step1_0),
+ [cm] "r"(cm), [stride] "r"(stride), [step1_0] "r"(step1_0),
[step1_1] "r"(step1_1), [step1_2] "r"(step1_2), [step1_3] "r"(step1_3),
[step1_4] "r"(step1_4), [step1_5] "r"(step1_5), [step1_6] "r"(step1_6),
[step1_7] "r"(step1_7), [step1_8] "r"(step1_8), [step1_9] "r"(step1_9),
@@ -869,7 +869,7 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) {
}
void vpx_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest,
- int dest_stride) {
+ int stride) {
DECLARE_ALIGNED(32, int16_t, out[16 * 16]);
uint32_t pos = 45;
@@ -880,11 +880,11 @@ void vpx_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest,
idct16_rows_dspr2(input, out, 16);
// Then transform columns and add to dest
- idct16_cols_add_blk_dspr2(out, dest, dest_stride);
+ idct16_cols_add_blk_dspr2(out, dest, stride);
}
void vpx_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest,
- int dest_stride) {
+ int stride) {
DECLARE_ALIGNED(32, int16_t, out[16 * 16]);
int16_t *outptr = out;
uint32_t i;
@@ -924,11 +924,11 @@ void vpx_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest,
}
// Then transform columns
- idct16_cols_add_blk_dspr2(out, dest, dest_stride);
+ idct16_cols_add_blk_dspr2(out, dest, stride);
}
void vpx_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest,
- int dest_stride) {
+ int stride) {
uint32_t pos = 45;
int32_t out;
int32_t r;
@@ -975,13 +975,54 @@ void vpx_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest,
"sw %[vector_2], 4(%[dest]) \n\t"
"sw %[vector_3], 8(%[dest]) \n\t"
"sw %[vector_4], 12(%[dest]) \n\t"
- "add %[dest], %[dest], %[dest_stride] \n\t"
+ "add %[dest], %[dest], %[stride] \n\t"
: [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4),
[vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2),
[vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4),
[dest] "+&r"(dest)
- : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1));
+ : [stride] "r"(stride), [vector_a1] "r"(vector_a1));
+ }
+ } else if (a1 > 255) {
+ int32_t a11, a12, vector_a11, vector_a12;
+
+ /* use quad-byte
+ * input and output memory are four byte aligned */
+ a11 = a1 >> 1;
+ a12 = a1 - a11;
+ __asm__ __volatile__(
+ "replv.qb %[vector_a11], %[a11] \n\t"
+ "replv.qb %[vector_a12], %[a12] \n\t"
+
+ : [vector_a11] "=&r"(vector_a11), [vector_a12] "=&r"(vector_a12)
+ : [a11] "r"(a11), [a12] "r"(a12));
+
+ for (r = 16; r--;) {
+ __asm__ __volatile__(
+ "lw %[t1], 0(%[dest]) \n\t"
+ "lw %[t2], 4(%[dest]) \n\t"
+ "lw %[t3], 8(%[dest]) \n\t"
+ "lw %[t4], 12(%[dest]) \n\t"
+ "addu_s.qb %[vector_1], %[t1], %[vector_a11] \n\t"
+ "addu_s.qb %[vector_2], %[t2], %[vector_a11] \n\t"
+ "addu_s.qb %[vector_3], %[t3], %[vector_a11] \n\t"
+ "addu_s.qb %[vector_4], %[t4], %[vector_a11] \n\t"
+ "addu_s.qb %[vector_1], %[vector_1], %[vector_a12] \n\t"
+ "addu_s.qb %[vector_2], %[vector_2], %[vector_a12] \n\t"
+ "addu_s.qb %[vector_3], %[vector_3], %[vector_a12] \n\t"
+ "addu_s.qb %[vector_4], %[vector_4], %[vector_a12] \n\t"
+ "sw %[vector_1], 0(%[dest]) \n\t"
+ "sw %[vector_2], 4(%[dest]) \n\t"
+ "sw %[vector_3], 8(%[dest]) \n\t"
+ "sw %[vector_4], 12(%[dest]) \n\t"
+ "add %[dest], %[dest], %[stride] \n\t"
+
+ : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4),
+ [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2),
+ [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4),
+ [dest] "+&r"(dest)
+ : [stride] "r"(stride), [vector_a11] "r"(vector_a11),
+ [vector_a12] "r"(vector_a12));
}
} else {
/* use quad-byte
@@ -1005,13 +1046,13 @@ void vpx_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest,
"sw %[vector_2], 4(%[dest]) \n\t"
"sw %[vector_3], 8(%[dest]) \n\t"
"sw %[vector_4], 12(%[dest]) \n\t"
- "add %[dest], %[dest], %[dest_stride] \n\t"
+ "add %[dest], %[dest], %[stride] \n\t"
: [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4),
[vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2),
[vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4),
[dest] "+&r"(dest)
- : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1));
+ : [stride] "r"(stride), [vector_a1] "r"(vector_a1));
}
}
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/itrans32_cols_dspr2.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/itrans32_cols_dspr2.c
index ce25d55c9c0..3f043b48baf 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/itrans32_cols_dspr2.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/itrans32_cols_dspr2.c
@@ -13,26 +13,25 @@
#include "vpx_dsp/txfm_common.h"
#if HAVE_DSPR2
-void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
- int dest_stride) {
- int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6;
- int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13;
- int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19;
- int16_t step1_20, step1_21, step1_22, step1_23, step1_24, step1_25, step1_26;
- int16_t step1_27, step1_28, step1_29, step1_30, step1_31;
- int16_t step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
- int16_t step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13;
- int16_t step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20;
- int16_t step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27;
- int16_t step2_28, step2_29, step2_30, step2_31;
- int16_t step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14;
- int16_t step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21;
- int16_t step3_22, step3_23, step3_24, step3_25, step3_26, step3_27;
- int16_t step3_28, step3_29, step3_30, step3_31;
+void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int stride) {
+ int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6;
+ int step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13;
+ int step1_14, step1_15, step1_16, step1_17, step1_18, step1_19, step1_20;
+ int step1_21, step1_22, step1_23, step1_24, step1_25, step1_26, step1_27;
+ int step1_28, step1_29, step1_30, step1_31;
+ int step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
+ int step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13;
+ int step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20;
+ int step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27;
+ int step2_28, step2_29, step2_30, step2_31;
+ int step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14;
+ int step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21;
+ int step3_22, step3_23, step3_24, step3_25, step3_26, step3_27, step3_28;
+ int step3_29, step3_30, step3_31;
int temp0, temp1, temp2, temp3;
int load1, load2, load3, load4;
int result1, result2;
- int i, temp21;
+ int i;
uint8_t *dest_pix, *dest_pix1;
const int const_2_power_13 = 8192;
uint8_t *cm = vpx_ff_cropTbl;
@@ -49,7 +48,7 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
for (i = 0; i < 32; ++i) {
dest_pix = dest + i;
- dest_pix1 = dest + i + 31 * dest_stride;
+ dest_pix1 = dest + i + 31 * stride;
__asm__ __volatile__(
"lh %[load1], 2(%[input]) \n\t"
@@ -103,9 +102,9 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
: [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
[load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
- [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_16] "=r"(step1_16),
- [step1_17] "=r"(step1_17), [step1_30] "=r"(step1_30),
- [step1_31] "=r"(step1_31)
+ [temp2] "=&r"(temp2), [temp3] "=&r"(temp3),
+ [step1_16] "=&r"(step1_16), [step1_17] "=&r"(step1_17),
+ [step1_30] "=&r"(step1_30), [step1_31] "=&r"(step1_31)
: [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
[cospi_31_64] "r"(cospi_31_64), [cospi_1_64] "r"(cospi_1_64),
[cospi_4_64] "r"(cospi_4_64), [cospi_17_64] "r"(cospi_17_64),
@@ -163,9 +162,9 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
: [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
[load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
- [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_18] "=r"(step1_18),
- [step1_19] "=r"(step1_19), [step1_28] "=r"(step1_28),
- [step1_29] "=r"(step1_29)
+ [temp2] "=&r"(temp2), [temp3] "=&r"(temp3),
+ [step1_18] "=&r"(step1_18), [step1_19] "=&r"(step1_19),
+ [step1_28] "=&r"(step1_28), [step1_29] "=&r"(step1_29)
: [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
[cospi_23_64] "r"(cospi_23_64), [cospi_9_64] "r"(cospi_9_64),
[cospi_4_64] "r"(cospi_4_64), [cospi_7_64] "r"(cospi_7_64),
@@ -223,9 +222,9 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
: [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
[load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
- [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_20] "=r"(step1_20),
- [step1_21] "=r"(step1_21), [step1_26] "=r"(step1_26),
- [step1_27] "=r"(step1_27)
+ [temp2] "=&r"(temp2), [temp3] "=&r"(temp3),
+ [step1_20] "=&r"(step1_20), [step1_21] "=&r"(step1_21),
+ [step1_26] "=&r"(step1_26), [step1_27] "=&r"(step1_27)
: [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
[cospi_27_64] "r"(cospi_27_64), [cospi_5_64] "r"(cospi_5_64),
[cospi_11_64] "r"(cospi_11_64), [cospi_21_64] "r"(cospi_21_64),
@@ -279,9 +278,9 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
: [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
[load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
- [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_22] "=r"(step1_22),
- [step1_23] "=r"(step1_23), [step1_24] "=r"(step1_24),
- [step1_25] "=r"(step1_25)
+ [temp2] "=&r"(temp2), [temp3] "=&r"(temp3),
+ [step1_22] "=&r"(step1_22), [step1_23] "=&r"(step1_23),
+ [step1_24] "=&r"(step1_24), [step1_25] "=&r"(step1_25)
: [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
[cospi_19_64] "r"(cospi_19_64), [cospi_13_64] "r"(cospi_13_64),
[cospi_3_64] "r"(cospi_3_64), [cospi_29_64] "r"(cospi_29_64),
@@ -335,9 +334,9 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
: [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
[load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
- [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_8] "=r"(step2_8),
- [step2_9] "=r"(step2_9), [step2_14] "=r"(step2_14),
- [step2_15] "=r"(step2_15)
+ [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_8] "=&r"(step2_8),
+ [step2_9] "=&r"(step2_9), [step2_14] "=&r"(step2_14),
+ [step2_15] "=&r"(step2_15)
: [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
[cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64),
[cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64),
@@ -391,9 +390,9 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
: [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
[load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
- [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_10] "=r"(step2_10),
- [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12),
- [step2_13] "=r"(step2_13)
+ [temp2] "=&r"(temp2), [temp3] "=&r"(temp3),
+ [step2_10] "=&r"(step2_10), [step2_11] "=&r"(step2_11),
+ [step2_12] "=&r"(step2_12), [step2_13] "=&r"(step2_13)
: [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
[cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64),
[cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64),
@@ -434,116 +433,154 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
"extp %[step3_11], $ac2, 31 \n\t"
"extp %[step3_12], $ac3, 31 \n\t"
- : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [step3_8] "=r"(step3_8),
- [step3_9] "=r"(step3_9), [step3_10] "=r"(step3_10),
- [step3_11] "=r"(step3_11), [step3_12] "=r"(step3_12),
- [step3_13] "=r"(step3_13), [step3_14] "=r"(step3_14),
- [step3_15] "=r"(step3_15)
+ : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [step3_8] "=&r"(step3_8),
+ [step3_9] "=&r"(step3_9), [step3_10] "=&r"(step3_10),
+ [step3_11] "=&r"(step3_11), [step3_12] "=&r"(step3_12),
+ [step3_13] "=&r"(step3_13), [step3_14] "=&r"(step3_14),
+ [step3_15] "=&r"(step3_15)
: [const_2_power_13] "r"(const_2_power_13), [step2_8] "r"(step2_8),
[step2_9] "r"(step2_9), [step2_10] "r"(step2_10),
[step2_11] "r"(step2_11), [step2_12] "r"(step2_12),
[step2_13] "r"(step2_13), [step2_14] "r"(step2_14),
[step2_15] "r"(step2_15), [cospi_16_64] "r"(cospi_16_64));
- step2_18 = step1_17 - step1_18;
- step2_29 = step1_30 - step1_29;
-
__asm__ __volatile__(
"mtlo %[const_2_power_13], $ac0 \n\t"
"mthi $zero, $ac0 \n\t"
- "msub $ac0, %[step2_18], %[cospi_8_64] \n\t"
- "madd $ac0, %[step2_29], %[cospi_24_64] \n\t"
- "extp %[step3_18], $ac0, 31 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "sub %[temp0], %[step1_17], %[step1_18] \n\t"
+ "sub %[temp1], %[step1_30], %[step1_29] \n\t"
+ "add %[step3_17], %[step1_17], %[step1_18] \n\t"
+ "add %[step3_30], %[step1_30], %[step1_29] \n\t"
- : [step3_18] "=r"(step3_18)
- : [const_2_power_13] "r"(const_2_power_13), [step2_18] "r"(step2_18),
- [step2_29] "r"(step2_29), [cospi_24_64] "r"(cospi_24_64),
+ "msub $ac0, %[temp0], %[cospi_8_64] \n\t"
+ "madd $ac0, %[temp1], %[cospi_24_64] \n\t"
+ "extp %[step3_18], $ac0, 31 \n\t"
+ "madd $ac1, %[temp0], %[cospi_24_64] \n\t"
+ "madd $ac1, %[temp1], %[cospi_8_64] \n\t"
+ "extp %[step3_29], $ac1, 31 \n\t"
+
+ : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
+ [step3_18] "=&r"(step3_18), [step3_29] "=&r"(step3_29),
+ [step3_17] "=&r"(step3_17), [step3_30] "=&r"(step3_30)
+ : [const_2_power_13] "r"(const_2_power_13), [step1_17] "r"(step1_17),
+ [step1_18] "r"(step1_18), [step1_30] "r"(step1_30),
+ [step1_29] "r"(step1_29), [cospi_24_64] "r"(cospi_24_64),
[cospi_8_64] "r"(cospi_8_64));
- temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64;
- step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
-
- step2_19 = step1_16 - step1_19;
- step2_28 = step1_31 - step1_28;
-
__asm__ __volatile__(
"mtlo %[const_2_power_13], $ac0 \n\t"
"mthi $zero, $ac0 \n\t"
- "msub $ac0, %[step2_19], %[cospi_8_64] \n\t"
- "madd $ac0, %[step2_28], %[cospi_24_64] \n\t"
- "extp %[step3_19], $ac0, 31 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "sub %[temp0], %[step1_16], %[step1_19] \n\t"
+ "sub %[temp1], %[step1_31], %[step1_28] \n\t"
+ "add %[step3_16], %[step1_16], %[step1_19] \n\t"
+ "add %[step3_31], %[step1_31], %[step1_28] \n\t"
- : [step3_19] "=r"(step3_19)
- : [const_2_power_13] "r"(const_2_power_13), [step2_19] "r"(step2_19),
- [step2_28] "r"(step2_28), [cospi_24_64] "r"(cospi_24_64),
+ "msub $ac0, %[temp0], %[cospi_8_64] \n\t"
+ "madd $ac0, %[temp1], %[cospi_24_64] \n\t"
+ "extp %[step3_19], $ac0, 31 \n\t"
+ "madd $ac1, %[temp0], %[cospi_24_64] \n\t"
+ "madd $ac1, %[temp1], %[cospi_8_64] \n\t"
+ "extp %[step3_28], $ac1, 31 \n\t"
+
+ : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
+ [step3_16] "=&r"(step3_16), [step3_31] "=&r"(step3_31),
+ [step3_19] "=&r"(step3_19), [step3_28] "=&r"(step3_28)
+ : [const_2_power_13] "r"(const_2_power_13), [step1_16] "r"(step1_16),
+ [step1_19] "r"(step1_19), [step1_31] "r"(step1_31),
+ [step1_28] "r"(step1_28), [cospi_24_64] "r"(cospi_24_64),
[cospi_8_64] "r"(cospi_8_64));
- temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64;
- step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
-
- step3_16 = step1_16 + step1_19;
- step3_17 = step1_17 + step1_18;
- step3_30 = step1_29 + step1_30;
- step3_31 = step1_28 + step1_31;
-
- step2_20 = step1_23 - step1_20;
- step2_27 = step1_24 - step1_27;
-
__asm__ __volatile__(
"mtlo %[const_2_power_13], $ac0 \n\t"
"mthi $zero, $ac0 \n\t"
- "msub $ac0, %[step2_20], %[cospi_24_64] \n\t"
- "msub $ac0, %[step2_27], %[cospi_8_64] \n\t"
- "extp %[step3_20], $ac0, 31 \n\t"
-
- : [step3_20] "=r"(step3_20)
- : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20),
- [step2_27] "r"(step2_27), [cospi_24_64] "r"(cospi_24_64),
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "sub %[temp0], %[step1_23], %[step1_20] \n\t"
+ "sub %[temp1], %[step1_24], %[step1_27] \n\t"
+ "add %[step3_23], %[step1_23], %[step1_20] \n\t"
+ "add %[step3_24], %[step1_24], %[step1_27] \n\t"
+
+ "msub $ac0, %[temp0], %[cospi_8_64] \n\t"
+ "madd $ac0, %[temp1], %[cospi_24_64] \n\t"
+ "extp %[step3_27], $ac0, 31 \n\t"
+ "msub $ac1, %[temp0], %[cospi_24_64] \n\t"
+ "msub $ac1, %[temp1], %[cospi_8_64] \n\t"
+ "extp %[step3_20], $ac1, 31 \n\t"
+
+ : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
+ [step3_23] "=&r"(step3_23), [step3_24] "=&r"(step3_24),
+ [step3_20] "=&r"(step3_20), [step3_27] "=&r"(step3_27)
+ : [const_2_power_13] "r"(const_2_power_13), [step1_23] "r"(step1_23),
+ [step1_20] "r"(step1_20), [step1_24] "r"(step1_24),
+ [step1_27] "r"(step1_27), [cospi_24_64] "r"(cospi_24_64),
[cospi_8_64] "r"(cospi_8_64));
- temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64;
- step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
-
- step2_21 = step1_22 - step1_21;
- step2_26 = step1_25 - step1_26;
-
__asm__ __volatile__(
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
"mtlo %[const_2_power_13], $ac1 \n\t"
"mthi $zero, $ac1 \n\t"
- "msub $ac1, %[step2_21], %[cospi_24_64] \n\t"
- "msub $ac1, %[step2_26], %[cospi_8_64] \n\t"
- "extp %[step3_21], $ac1, 31 \n\t"
-
- : [step3_21] "=r"(step3_21)
- : [const_2_power_13] "r"(const_2_power_13), [step2_21] "r"(step2_21),
- [step2_26] "r"(step2_26), [cospi_24_64] "r"(cospi_24_64),
+ "sub %[temp0], %[step1_22], %[step1_21] \n\t"
+ "sub %[temp1], %[step1_25], %[step1_26] \n\t"
+ "add %[step3_22], %[step1_22], %[step1_21] \n\t"
+ "add %[step3_25], %[step1_25], %[step1_26] \n\t"
+
+ "msub $ac0, %[temp0], %[cospi_24_64] \n\t"
+ "msub $ac0, %[temp1], %[cospi_8_64] \n\t"
+ "extp %[step3_21], $ac0, 31 \n\t"
+ "msub $ac1, %[temp0], %[cospi_8_64] \n\t"
+ "madd $ac1, %[temp1], %[cospi_24_64] \n\t"
+ "extp %[step3_26], $ac1, 31 \n\t"
+
+ : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
+ [step3_22] "=&r"(step3_22), [step3_25] "=&r"(step3_25),
+ [step3_21] "=&r"(step3_21), [step3_26] "=&r"(step3_26)
+ : [const_2_power_13] "r"(const_2_power_13), [step1_22] "r"(step1_22),
+ [step1_21] "r"(step1_21), [step1_25] "r"(step1_25),
+ [step1_26] "r"(step1_26), [cospi_24_64] "r"(cospi_24_64),
[cospi_8_64] "r"(cospi_8_64));
- temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64;
- step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
-
- step3_22 = step1_21 + step1_22;
- step3_23 = step1_20 + step1_23;
- step3_24 = step1_24 + step1_27;
- step3_25 = step1_25 + step1_26;
-
- step2_16 = step3_16 + step3_23;
- step2_17 = step3_17 + step3_22;
- step2_18 = step3_18 + step3_21;
- step2_19 = step3_19 + step3_20;
- step2_20 = step3_19 - step3_20;
- step2_21 = step3_18 - step3_21;
- step2_22 = step3_17 - step3_22;
- step2_23 = step3_16 - step3_23;
-
- step2_24 = step3_31 - step3_24;
- step2_25 = step3_30 - step3_25;
- step2_26 = step3_29 - step3_26;
- step2_27 = step3_28 - step3_27;
- step2_28 = step3_28 + step3_27;
- step2_29 = step3_29 + step3_26;
- step2_30 = step3_30 + step3_25;
- step2_31 = step3_31 + step3_24;
+ __asm__ __volatile__(
+ "add %[step2_16], %[step3_16], %[step3_23] \n\t"
+ "add %[step2_17], %[step3_17], %[step3_22] \n\t"
+ "add %[step2_18], %[step3_18], %[step3_21] \n\t"
+ "add %[step2_19], %[step3_19], %[step3_20] \n\t"
+ "sub %[step2_20], %[step3_19], %[step3_20] \n\t"
+ "sub %[step2_21], %[step3_18], %[step3_21] \n\t"
+ "sub %[step2_22], %[step3_17], %[step3_22] \n\t"
+ "sub %[step2_23], %[step3_16], %[step3_23] \n\t"
+
+ : [step2_16] "=&r"(step2_16), [step2_17] "=&r"(step2_17),
+ [step2_18] "=&r"(step2_18), [step2_19] "=&r"(step2_19),
+ [step2_20] "=&r"(step2_20), [step2_21] "=&r"(step2_21),
+ [step2_22] "=&r"(step2_22), [step2_23] "=&r"(step2_23)
+ : [step3_16] "r"(step3_16), [step3_23] "r"(step3_23),
+ [step3_17] "r"(step3_17), [step3_22] "r"(step3_22),
+ [step3_18] "r"(step3_18), [step3_21] "r"(step3_21),
+ [step3_19] "r"(step3_19), [step3_20] "r"(step3_20));
+
+ __asm__ __volatile__(
+ "sub %[step2_24], %[step3_31], %[step3_24] \n\t"
+ "sub %[step2_25], %[step3_30], %[step3_25] \n\t"
+ "sub %[step2_26], %[step3_29], %[step3_26] \n\t"
+ "sub %[step2_27], %[step3_28], %[step3_27] \n\t"
+ "add %[step2_28], %[step3_28], %[step3_27] \n\t"
+ "add %[step2_29], %[step3_29], %[step3_26] \n\t"
+ "add %[step2_30], %[step3_30], %[step3_25] \n\t"
+ "add %[step2_31], %[step3_31], %[step3_24] \n\t"
+
+ : [step2_24] "=&r"(step2_24), [step2_28] "=&r"(step2_28),
+ [step2_25] "=&r"(step2_25), [step2_29] "=&r"(step2_29),
+ [step2_26] "=&r"(step2_26), [step2_30] "=&r"(step2_30),
+ [step2_27] "=&r"(step2_27), [step2_31] "=&r"(step2_31)
+ : [step3_31] "r"(step3_31), [step3_24] "r"(step3_24),
+ [step3_30] "r"(step3_30), [step3_25] "r"(step3_25),
+ [step3_29] "r"(step3_29), [step3_26] "r"(step3_26),
+ [step3_28] "r"(step3_28), [step3_27] "r"(step3_27));
__asm__ __volatile__(
"lh %[load1], 0(%[input]) \n\t"
@@ -580,9 +617,9 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
: [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
[load4] "=&r"(load4), [result1] "=&r"(result1),
[result2] "=&r"(result2), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
- [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_0] "=r"(step1_0),
- [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2),
- [step1_3] "=r"(step1_3)
+ [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_0] "=&r"(step1_0),
+ [step1_1] "=&r"(step1_1), [step1_2] "=&r"(step1_2),
+ [step1_3] "=&r"(step1_3)
: [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
[cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64),
[cospi_16_64] "r"(cospi_16_64));
@@ -638,96 +675,137 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
: [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
[load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
- [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_4] "=r"(step1_4),
- [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6),
- [step1_7] "=r"(step1_7)
+ [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_4] "=&r"(step1_4),
+ [step1_5] "=&r"(step1_5), [step1_6] "=&r"(step1_6),
+ [step1_7] "=&r"(step1_7)
: [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
[cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64),
[cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64),
[cospi_16_64] "r"(cospi_16_64));
- step2_0 = step1_0 + step1_7;
- step2_1 = step1_1 + step1_6;
- step2_2 = step1_2 + step1_5;
- step2_3 = step1_3 + step1_4;
- step2_4 = step1_3 - step1_4;
- step2_5 = step1_2 - step1_5;
- step2_6 = step1_1 - step1_6;
- step2_7 = step1_0 - step1_7;
+ __asm__ __volatile__(
+ "add %[step2_0], %[step1_0], %[step1_7] \n\t"
+ "add %[step2_1], %[step1_1], %[step1_6] \n\t"
+ "add %[step2_2], %[step1_2], %[step1_5] \n\t"
+ "add %[step2_3], %[step1_3], %[step1_4] \n\t"
+ "sub %[step2_4], %[step1_3], %[step1_4] \n\t"
+ "sub %[step2_5], %[step1_2], %[step1_5] \n\t"
+ "sub %[step2_6], %[step1_1], %[step1_6] \n\t"
+ "sub %[step2_7], %[step1_0], %[step1_7] \n\t"
+
+ : [step2_0] "=&r"(step2_0), [step2_4] "=&r"(step2_4),
+ [step2_1] "=&r"(step2_1), [step2_5] "=&r"(step2_5),
+ [step2_2] "=&r"(step2_2), [step2_6] "=&r"(step2_6),
+ [step2_3] "=&r"(step2_3), [step2_7] "=&r"(step2_7)
+ : [step1_0] "r"(step1_0), [step1_7] "r"(step1_7),
+ [step1_1] "r"(step1_1), [step1_6] "r"(step1_6),
+ [step1_2] "r"(step1_2), [step1_5] "r"(step1_5),
+ [step1_3] "r"(step1_3), [step1_4] "r"(step1_4));
// stage 7
- step1_0 = step2_0 + step3_15;
- step1_1 = step2_1 + step3_14;
- step1_2 = step2_2 + step3_13;
- step1_3 = step2_3 + step3_12;
- step1_4 = step2_4 + step3_11;
- step1_5 = step2_5 + step3_10;
- step1_6 = step2_6 + step3_9;
- step1_7 = step2_7 + step3_8;
- step1_8 = step2_7 - step3_8;
- step1_9 = step2_6 - step3_9;
- step1_10 = step2_5 - step3_10;
- step1_11 = step2_4 - step3_11;
- step1_12 = step2_3 - step3_12;
- step1_13 = step2_2 - step3_13;
- step1_14 = step2_1 - step3_14;
- step1_15 = step2_0 - step3_15;
-
__asm__ __volatile__(
- "sub %[temp0], %[step2_27], %[step2_20] \n\t"
- "mtlo %[const_2_power_13], $ac0 \n\t"
- "mthi $zero, $ac0 \n\t"
- "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
- "extp %[step1_20], $ac0, 31 \n\t"
+ "add %[step1_0], %[step2_0], %[step3_15] \n\t"
+ "add %[step1_1], %[step2_1], %[step3_14] \n\t"
+ "add %[step1_2], %[step2_2], %[step3_13] \n\t"
+ "add %[step1_3], %[step2_3], %[step3_12] \n\t"
+ "sub %[step1_12], %[step2_3], %[step3_12] \n\t"
+ "sub %[step1_13], %[step2_2], %[step3_13] \n\t"
+ "sub %[step1_14], %[step2_1], %[step3_14] \n\t"
+ "sub %[step1_15], %[step2_0], %[step3_15] \n\t"
+
+ : [step1_0] "=&r"(step1_0), [step1_12] "=&r"(step1_12),
+ [step1_1] "=&r"(step1_1), [step1_13] "=&r"(step1_13),
+ [step1_2] "=&r"(step1_2), [step1_14] "=&r"(step1_14),
+ [step1_3] "=&r"(step1_3), [step1_15] "=&r"(step1_15)
+ : [step2_0] "r"(step2_0), [step3_15] "r"(step3_15),
+ [step2_1] "r"(step2_1), [step3_14] "r"(step3_14),
+ [step2_2] "r"(step2_2), [step3_13] "r"(step3_13),
+ [step2_3] "r"(step2_3), [step3_12] "r"(step3_12));
- : [temp0] "=&r"(temp0), [step1_20] "=r"(step1_20)
- : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20),
- [step2_27] "r"(step2_27), [cospi_16_64] "r"(cospi_16_64));
-
- temp21 = (step2_20 + step2_27) * cospi_16_64;
- step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+ __asm__ __volatile__(
+ "add %[step1_4], %[step2_4], %[step3_11] \n\t"
+ "add %[step1_5], %[step2_5], %[step3_10] \n\t"
+ "add %[step1_6], %[step2_6], %[step3_9] \n\t"
+ "add %[step1_7], %[step2_7], %[step3_8] \n\t"
+ "sub %[step1_8], %[step2_7], %[step3_8] \n\t"
+ "sub %[step1_9], %[step2_6], %[step3_9] \n\t"
+ "sub %[step1_10], %[step2_5], %[step3_10] \n\t"
+ "sub %[step1_11], %[step2_4], %[step3_11] \n\t"
+
+ : [step1_4] "=&r"(step1_4), [step1_8] "=&r"(step1_8),
+ [step1_5] "=&r"(step1_5), [step1_9] "=&r"(step1_9),
+ [step1_6] "=&r"(step1_6), [step1_10] "=&r"(step1_10),
+ [step1_7] "=&r"(step1_7), [step1_11] "=&r"(step1_11)
+ : [step2_4] "r"(step2_4), [step3_11] "r"(step3_11),
+ [step2_5] "r"(step2_5), [step3_10] "r"(step3_10),
+ [step2_6] "r"(step2_6), [step3_9] "r"(step3_9),
+ [step2_7] "r"(step2_7), [step3_8] "r"(step3_8));
__asm__ __volatile__(
- "sub %[temp0], %[step2_26], %[step2_21] \n\t"
+ "sub %[temp0], %[step2_27], %[step2_20] \n\t"
+ "add %[temp1], %[step2_27], %[step2_20] \n\t"
+ "sub %[temp2], %[step2_26], %[step2_21] \n\t"
+ "add %[temp3], %[step2_26], %[step2_21] \n\t"
+
"mtlo %[const_2_power_13], $ac0 \n\t"
"mthi $zero, $ac0 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
"madd $ac0, %[temp0], %[cospi_16_64] \n\t"
- "extp %[step1_21], $ac0, 31 \n\t"
+ "madd $ac1, %[temp1], %[cospi_16_64] \n\t"
+ "madd $ac2, %[temp2], %[cospi_16_64] \n\t"
+ "madd $ac3, %[temp3], %[cospi_16_64] \n\t"
- : [temp0] "=&r"(temp0), [step1_21] "=r"(step1_21)
- : [const_2_power_13] "r"(const_2_power_13), [step2_26] "r"(step2_26),
- [step2_21] "r"(step2_21), [cospi_16_64] "r"(cospi_16_64));
+ "extp %[step1_20], $ac0, 31 \n\t"
+ "extp %[step1_27], $ac1, 31 \n\t"
+ "extp %[step1_21], $ac2, 31 \n\t"
+ "extp %[step1_26], $ac3, 31 \n\t"
- temp21 = (step2_21 + step2_26) * cospi_16_64;
- step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+ : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
+ [temp3] "=&r"(temp3), [step1_20] "=&r"(step1_20),
+ [step1_27] "=&r"(step1_27), [step1_21] "=&r"(step1_21),
+ [step1_26] "=&r"(step1_26)
+ : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20),
+ [step2_27] "r"(step2_27), [step2_21] "r"(step2_21),
+ [step2_26] "r"(step2_26), [cospi_16_64] "r"(cospi_16_64));
__asm__ __volatile__(
"sub %[temp0], %[step2_25], %[step2_22] \n\t"
- "mtlo %[const_2_power_13], $ac0 \n\t"
- "mthi $zero, $ac0 \n\t"
- "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
- "extp %[step1_22], $ac0, 31 \n\t"
-
- : [temp0] "=&r"(temp0), [step1_22] "=r"(step1_22)
- : [const_2_power_13] "r"(const_2_power_13), [step2_25] "r"(step2_25),
- [step2_22] "r"(step2_22), [cospi_16_64] "r"(cospi_16_64));
+ "add %[temp1], %[step2_25], %[step2_22] \n\t"
+ "sub %[temp2], %[step2_24], %[step2_23] \n\t"
+ "add %[temp3], %[step2_24], %[step2_23] \n\t"
- temp21 = (step2_22 + step2_25) * cospi_16_64;
- step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
-
- __asm__ __volatile__(
- "sub %[temp0], %[step2_24], %[step2_23] \n\t"
"mtlo %[const_2_power_13], $ac0 \n\t"
"mthi $zero, $ac0 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
"madd $ac0, %[temp0], %[cospi_16_64] \n\t"
- "extp %[step1_23], $ac0, 31 \n\t"
+ "madd $ac1, %[temp1], %[cospi_16_64] \n\t"
+ "madd $ac2, %[temp2], %[cospi_16_64] \n\t"
+ "madd $ac3, %[temp3], %[cospi_16_64] \n\t"
- : [temp0] "=&r"(temp0), [step1_23] "=r"(step1_23)
- : [const_2_power_13] "r"(const_2_power_13), [step2_24] "r"(step2_24),
- [step2_23] "r"(step2_23), [cospi_16_64] "r"(cospi_16_64));
+ "extp %[step1_22], $ac0, 31 \n\t"
+ "extp %[step1_25], $ac1, 31 \n\t"
+ "extp %[step1_23], $ac2, 31 \n\t"
+ "extp %[step1_24], $ac3, 31 \n\t"
- temp21 = (step2_23 + step2_24) * cospi_16_64;
- step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+ : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
+ [temp3] "=&r"(temp3), [step1_22] "=&r"(step1_22),
+ [step1_25] "=&r"(step1_25), [step1_23] "=&r"(step1_23),
+ [step1_24] "=&r"(step1_24)
+ : [const_2_power_13] "r"(const_2_power_13), [step2_22] "r"(step2_22),
+ [step2_25] "r"(step2_25), [step2_23] "r"(step2_23),
+ [step2_24] "r"(step2_24), [cospi_16_64] "r"(cospi_16_64));
__asm__ __volatile__(
"lbu %[temp2], 0(%[dest_pix]) \n\t"
@@ -738,14 +816,14 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
"lbux %[temp0], %[temp2](%[cm]) \n\t"
"add %[temp1], %[step1_1], %[step2_30] \n\t"
"sb %[temp0], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"lbu %[temp3], 0(%[dest_pix]) \n\t"
"addi %[temp1], %[temp1], 32 \n\t"
"sra %[temp1], %[temp1], 6 \n\t"
"add %[temp3], %[temp3], %[temp1] \n\t"
"lbux %[temp1], %[temp3](%[cm]) \n\t"
"sb %[temp1], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"lbu %[temp2], 0(%[dest_pix]) \n\t"
"add %[temp0], %[step1_2], %[step2_29] \n\t"
@@ -755,18 +833,18 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
"lbux %[temp0], %[temp2](%[cm]) \n\t"
"add %[temp1], %[step1_3], %[step2_28] \n\t"
"sb %[temp0], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"lbu %[temp3], 0(%[dest_pix]) \n\t"
"addi %[temp1], %[temp1], 32 \n\t"
"sra %[temp1], %[temp1], 6 \n\t"
"add %[temp3], %[temp3], %[temp1] \n\t"
"lbux %[temp1], %[temp3](%[cm]) \n\t"
"sb %[temp1], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
: [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
[temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix)
- : [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_0] "r"(step1_0),
+ : [cm] "r"(cm), [stride] "r"(stride), [step1_0] "r"(step1_0),
[step1_1] "r"(step1_1), [step1_2] "r"(step1_2),
[step1_3] "r"(step1_3), [step2_28] "r"(step2_28),
[step2_29] "r"(step2_29), [step2_30] "r"(step2_30),
@@ -782,29 +860,29 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
"add %[temp2], %[temp2], %[step3_15] \n\t"
"lbux %[temp0], %[temp2](%[cm]) \n\t"
"sb %[temp0], 0(%[dest_pix1]) \n\t"
- "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t"
"lbu %[temp3], 0(%[dest_pix1]) \n\t"
"add %[temp3], %[temp3], %[step3_14] \n\t"
"lbux %[temp1], %[temp3](%[cm]) \n\t"
"sb %[temp1], 0(%[dest_pix1]) \n\t"
- "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t"
"lbu %[temp2], 0(%[dest_pix1]) \n\t"
"add %[temp2], %[temp2], %[step3_13] \n\t"
"lbux %[temp0], %[temp2](%[cm]) \n\t"
"sb %[temp0], 0(%[dest_pix1]) \n\t"
- "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t"
"lbu %[temp3], 0(%[dest_pix1]) \n\t"
"add %[temp3], %[temp3], %[step3_12] \n\t"
"lbux %[temp1], %[temp3](%[cm]) \n\t"
"sb %[temp1], 0(%[dest_pix1]) \n\t"
- "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t"
: [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
[temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1)
- : [cm] "r"(cm), [dest_stride] "r"(dest_stride),
- [step3_12] "r"(step3_12), [step3_13] "r"(step3_13),
- [step3_14] "r"(step3_14), [step3_15] "r"(step3_15));
+ : [cm] "r"(cm), [stride] "r"(stride), [step3_12] "r"(step3_12),
+ [step3_13] "r"(step3_13), [step3_14] "r"(step3_14),
+ [step3_15] "r"(step3_15));
__asm__ __volatile__(
"lbu %[temp2], 0(%[dest_pix]) \n\t"
@@ -815,14 +893,14 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
"lbux %[temp0], %[temp2](%[cm]) \n\t"
"add %[temp1], %[step1_5], %[step1_26] \n\t"
"sb %[temp0], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"lbu %[temp3], 0(%[dest_pix]) \n\t"
"addi %[temp1], %[temp1], 32 \n\t"
"sra %[temp1], %[temp1], 6 \n\t"
"add %[temp3], %[temp3], %[temp1] \n\t"
"lbux %[temp1], %[temp3](%[cm]) \n\t"
"sb %[temp1], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"lbu %[temp2], 0(%[dest_pix]) \n\t"
"add %[temp0], %[step1_6], %[step1_25] \n\t"
@@ -832,18 +910,18 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
"lbux %[temp0], %[temp2](%[cm]) \n\t"
"add %[temp1], %[step1_7], %[step1_24] \n\t"
"sb %[temp0], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"lbu %[temp3], 0(%[dest_pix]) \n\t"
"addi %[temp1], %[temp1], 32 \n\t"
"sra %[temp1], %[temp1], 6 \n\t"
"add %[temp3], %[temp3], %[temp1] \n\t"
"lbux %[temp1], %[temp3](%[cm]) \n\t"
"sb %[temp1], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
: [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
[temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix)
- : [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_4] "r"(step1_4),
+ : [cm] "r"(cm), [stride] "r"(stride), [step1_4] "r"(step1_4),
[step1_5] "r"(step1_5), [step1_6] "r"(step1_6),
[step1_7] "r"(step1_7), [step1_24] "r"(step1_24),
[step1_25] "r"(step1_25), [step1_26] "r"(step1_26),
@@ -859,29 +937,29 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
"add %[temp2], %[temp2], %[step3_15] \n\t"
"lbux %[temp0], %[temp2](%[cm]) \n\t"
"sb %[temp0], 0(%[dest_pix1]) \n\t"
- "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t"
"lbu %[temp3], 0(%[dest_pix1]) \n\t"
"add %[temp3], %[temp3], %[step3_14] \n\t"
"lbux %[temp1], %[temp3](%[cm]) \n\t"
"sb %[temp1], 0(%[dest_pix1]) \n\t"
- "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t"
"lbu %[temp2], 0(%[dest_pix1]) \n\t"
"add %[temp2], %[temp2], %[step3_13] \n\t"
"lbux %[temp0], %[temp2](%[cm]) \n\t"
"sb %[temp0], 0(%[dest_pix1]) \n\t"
- "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t"
"lbu %[temp3], 0(%[dest_pix1]) \n\t"
"add %[temp3], %[temp3], %[step3_12] \n\t"
"lbux %[temp1], %[temp3](%[cm]) \n\t"
"sb %[temp1], 0(%[dest_pix1]) \n\t"
- "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t"
: [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
[temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1)
- : [cm] "r"(cm), [dest_stride] "r"(dest_stride),
- [step3_12] "r"(step3_12), [step3_13] "r"(step3_13),
- [step3_14] "r"(step3_14), [step3_15] "r"(step3_15));
+ : [cm] "r"(cm), [stride] "r"(stride), [step3_12] "r"(step3_12),
+ [step3_13] "r"(step3_13), [step3_14] "r"(step3_14),
+ [step3_15] "r"(step3_15));
__asm__ __volatile__(
"lbu %[temp2], 0(%[dest_pix]) \n\t"
@@ -892,14 +970,14 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
"lbux %[temp0], %[temp2](%[cm]) \n\t"
"add %[temp1], %[step1_9], %[step1_22] \n\t"
"sb %[temp0], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"lbu %[temp3], 0(%[dest_pix]) \n\t"
"addi %[temp1], %[temp1], 32 \n\t"
"sra %[temp1], %[temp1], 6 \n\t"
"add %[temp3], %[temp3], %[temp1] \n\t"
"lbux %[temp1], %[temp3](%[cm]) \n\t"
"sb %[temp1], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"lbu %[temp2], 0(%[dest_pix]) \n\t"
"add %[temp0], %[step1_10], %[step1_21] \n\t"
@@ -909,18 +987,18 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
"lbux %[temp0], %[temp2](%[cm]) \n\t"
"add %[temp1], %[step1_11], %[step1_20] \n\t"
"sb %[temp0], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"lbu %[temp3], 0(%[dest_pix]) \n\t"
"addi %[temp1], %[temp1], 32 \n\t"
"sra %[temp1], %[temp1], 6 \n\t"
"add %[temp3], %[temp3], %[temp1] \n\t"
"lbux %[temp1], %[temp3](%[cm]) \n\t"
"sb %[temp1], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
: [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
[temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix)
- : [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_8] "r"(step1_8),
+ : [cm] "r"(cm), [stride] "r"(stride), [step1_8] "r"(step1_8),
[step1_9] "r"(step1_9), [step1_10] "r"(step1_10),
[step1_11] "r"(step1_11), [step1_20] "r"(step1_20),
[step1_21] "r"(step1_21), [step1_22] "r"(step1_22),
@@ -936,29 +1014,29 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
"add %[temp2], %[temp2], %[step3_15] \n\t"
"lbux %[temp0], %[temp2](%[cm]) \n\t"
"sb %[temp0], 0(%[dest_pix1]) \n\t"
- "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t"
"lbu %[temp3], 0(%[dest_pix1]) \n\t"
"add %[temp3], %[temp3], %[step3_14] \n\t"
"lbux %[temp1], %[temp3](%[cm]) \n\t"
"sb %[temp1], 0(%[dest_pix1]) \n\t"
- "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t"
"lbu %[temp2], 0(%[dest_pix1]) \n\t"
"add %[temp2], %[temp2], %[step3_13] \n\t"
"lbux %[temp0], %[temp2](%[cm]) \n\t"
"sb %[temp0], 0(%[dest_pix1]) \n\t"
- "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t"
"lbu %[temp3], 0(%[dest_pix1]) \n\t"
"add %[temp3], %[temp3], %[step3_12] \n\t"
"lbux %[temp1], %[temp3](%[cm]) \n\t"
"sb %[temp1], 0(%[dest_pix1]) \n\t"
- "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t"
: [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
[temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1)
- : [cm] "r"(cm), [dest_stride] "r"(dest_stride),
- [step3_12] "r"(step3_12), [step3_13] "r"(step3_13),
- [step3_14] "r"(step3_14), [step3_15] "r"(step3_15));
+ : [cm] "r"(cm), [stride] "r"(stride), [step3_12] "r"(step3_12),
+ [step3_13] "r"(step3_13), [step3_14] "r"(step3_14),
+ [step3_15] "r"(step3_15));
__asm__ __volatile__(
"lbu %[temp2], 0(%[dest_pix]) \n\t"
@@ -969,14 +1047,14 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
"lbux %[temp0], %[temp2](%[cm]) \n\t"
"add %[temp1], %[step1_13], %[step2_18] \n\t"
"sb %[temp0], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"lbu %[temp3], 0(%[dest_pix]) \n\t"
"addi %[temp1], %[temp1], 32 \n\t"
"sra %[temp1], %[temp1], 6 \n\t"
"add %[temp3], %[temp3], %[temp1] \n\t"
"lbux %[temp1], %[temp3](%[cm]) \n\t"
"sb %[temp1], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"lbu %[temp2], 0(%[dest_pix]) \n\t"
"add %[temp0], %[step1_14], %[step2_17] \n\t"
@@ -986,7 +1064,7 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
"lbux %[temp0], %[temp2](%[cm]) \n\t"
"add %[temp1], %[step1_15], %[step2_16] \n\t"
"sb %[temp0], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"lbu %[temp3], 0(%[dest_pix]) \n\t"
"addi %[temp1], %[temp1], 32 \n\t"
"sra %[temp1], %[temp1], 6 \n\t"
@@ -996,11 +1074,11 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
: [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
[temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix)
- : [cm] "r"(cm), [dest_stride] "r"(dest_stride),
- [step1_12] "r"(step1_12), [step1_13] "r"(step1_13),
- [step1_14] "r"(step1_14), [step1_15] "r"(step1_15),
- [step2_16] "r"(step2_16), [step2_17] "r"(step2_17),
- [step2_18] "r"(step2_18), [step2_19] "r"(step2_19));
+ : [cm] "r"(cm), [stride] "r"(stride), [step1_12] "r"(step1_12),
+ [step1_13] "r"(step1_13), [step1_14] "r"(step1_14),
+ [step1_15] "r"(step1_15), [step2_16] "r"(step2_16),
+ [step2_17] "r"(step2_17), [step2_18] "r"(step2_18),
+ [step2_19] "r"(step2_19));
step3_12 = ROUND_POWER_OF_TWO((step1_15 - step2_16), 6);
step3_13 = ROUND_POWER_OF_TWO((step1_14 - step2_17), 6);
@@ -1012,18 +1090,18 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
"add %[temp2], %[temp2], %[step3_15] \n\t"
"lbux %[temp0], %[temp2](%[cm]) \n\t"
"sb %[temp0], 0(%[dest_pix1]) \n\t"
- "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t"
"lbu %[temp3], 0(%[dest_pix1]) \n\t"
"add %[temp3], %[temp3], %[step3_14] \n\t"
"lbux %[temp1], %[temp3](%[cm]) \n\t"
"sb %[temp1], 0(%[dest_pix1]) \n\t"
- "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t"
"lbu %[temp2], 0(%[dest_pix1]) \n\t"
"add %[temp2], %[temp2], %[step3_13] \n\t"
"lbux %[temp0], %[temp2](%[cm]) \n\t"
"sb %[temp0], 0(%[dest_pix1]) \n\t"
- "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t"
"lbu %[temp3], 0(%[dest_pix1]) \n\t"
"add %[temp3], %[temp3], %[step3_12] \n\t"
"lbux %[temp1], %[temp3](%[cm]) \n\t"
@@ -1031,9 +1109,9 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
: [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
[temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1)
- : [cm] "r"(cm), [dest_stride] "r"(dest_stride),
- [step3_12] "r"(step3_12), [step3_13] "r"(step3_13),
- [step3_14] "r"(step3_14), [step3_15] "r"(step3_15));
+ : [cm] "r"(cm), [stride] "r"(stride), [step3_12] "r"(step3_12),
+ [step3_13] "r"(step3_13), [step3_14] "r"(step3_14),
+ [step3_15] "r"(step3_15));
input += 32;
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/itrans32_dspr2.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/itrans32_dspr2.c
index d71c5ffed51..3c0468c00fa 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/itrans32_dspr2.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/itrans32_dspr2.c
@@ -18,24 +18,23 @@
#if HAVE_DSPR2
static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
uint32_t no_rows) {
- int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6;
- int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13;
- int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19, step1_20;
- int16_t step1_21, step1_22, step1_23, step1_24, step1_25, step1_26, step1_27;
- int16_t step1_28, step1_29, step1_30, step1_31;
- int16_t step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
- int16_t step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13;
- int16_t step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20;
- int16_t step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27;
- int16_t step2_28, step2_29, step2_30, step2_31;
- int16_t step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14;
- int16_t step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21;
- int16_t step3_22, step3_23, step3_24, step3_25, step3_26, step3_27, step3_28;
- int16_t step3_29, step3_30, step3_31;
+ int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6;
+ int step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13;
+ int step1_14, step1_15, step1_16, step1_17, step1_18, step1_19, step1_20;
+ int step1_21, step1_22, step1_23, step1_24, step1_25, step1_26, step1_27;
+ int step1_28, step1_29, step1_30, step1_31;
+ int step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
+ int step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13;
+ int step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20;
+ int step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27;
+ int step2_28, step2_29, step2_30, step2_31;
+ int step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14;
+ int step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21;
+ int step3_22, step3_23, step3_24, step3_25, step3_26, step3_27, step3_28;
+ int step3_29, step3_30, step3_31;
int temp0, temp1, temp2, temp3;
int load1, load2, load3, load4;
int result1, result2;
- int temp21;
int i;
const int const_2_power_13 = 8192;
const int32_t *input_int;
@@ -147,9 +146,9 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
: [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
[load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
- [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_16] "=r"(step1_16),
- [step1_17] "=r"(step1_17), [step1_30] "=r"(step1_30),
- [step1_31] "=r"(step1_31)
+ [temp2] "=&r"(temp2), [temp3] "=&r"(temp3),
+ [step1_16] "=&r"(step1_16), [step1_17] "=&r"(step1_17),
+ [step1_30] "=&r"(step1_30), [step1_31] "=&r"(step1_31)
: [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
[cospi_31_64] "r"(cospi_31_64), [cospi_1_64] "r"(cospi_1_64),
[cospi_4_64] "r"(cospi_4_64), [cospi_17_64] "r"(cospi_17_64),
@@ -207,9 +206,9 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
: [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
[load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
- [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_18] "=r"(step1_18),
- [step1_19] "=r"(step1_19), [step1_28] "=r"(step1_28),
- [step1_29] "=r"(step1_29)
+ [temp2] "=&r"(temp2), [temp3] "=&r"(temp3),
+ [step1_18] "=&r"(step1_18), [step1_19] "=&r"(step1_19),
+ [step1_28] "=&r"(step1_28), [step1_29] "=&r"(step1_29)
: [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
[cospi_23_64] "r"(cospi_23_64), [cospi_9_64] "r"(cospi_9_64),
[cospi_4_64] "r"(cospi_4_64), [cospi_7_64] "r"(cospi_7_64),
@@ -267,9 +266,9 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
: [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
[load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
- [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_20] "=r"(step1_20),
- [step1_21] "=r"(step1_21), [step1_26] "=r"(step1_26),
- [step1_27] "=r"(step1_27)
+ [temp2] "=&r"(temp2), [temp3] "=&r"(temp3),
+ [step1_20] "=&r"(step1_20), [step1_21] "=&r"(step1_21),
+ [step1_26] "=&r"(step1_26), [step1_27] "=&r"(step1_27)
: [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
[cospi_27_64] "r"(cospi_27_64), [cospi_5_64] "r"(cospi_5_64),
[cospi_11_64] "r"(cospi_11_64), [cospi_21_64] "r"(cospi_21_64),
@@ -289,7 +288,6 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
"madd $ac1, %[load1], %[cospi_19_64] \n\t"
"msub $ac1, %[load2], %[cospi_13_64] \n\t"
"extp %[temp0], $ac1, 31 \n\t"
-
"madd $ac3, %[load1], %[cospi_13_64] \n\t"
"madd $ac3, %[load2], %[cospi_19_64] \n\t"
"extp %[temp3], $ac3, 31 \n\t"
@@ -302,7 +300,6 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
"madd $ac2, %[load3], %[cospi_3_64] \n\t"
"msub $ac2, %[load4], %[cospi_29_64] \n\t"
"extp %[temp1], $ac2, 31 \n\t"
-
"madd $ac1, %[load3], %[cospi_29_64] \n\t"
"madd $ac1, %[load4], %[cospi_3_64] \n\t"
"extp %[temp2], $ac1, 31 \n\t"
@@ -314,12 +311,10 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
"sub %[load1], %[temp1], %[temp0] \n\t"
"sub %[load2], %[temp2], %[temp3] \n\t"
-
"msub $ac1, %[load1], %[cospi_12_64] \n\t"
"msub $ac1, %[load2], %[cospi_20_64] \n\t"
"msub $ac3, %[load1], %[cospi_20_64] \n\t"
"madd $ac3, %[load2], %[cospi_12_64] \n\t"
-
"extp %[step1_22], $ac1, 31 \n\t"
"extp %[step1_25], $ac3, 31 \n\t"
"add %[step1_23], %[temp0], %[temp1] \n\t"
@@ -327,9 +322,9 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
: [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
[load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
- [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_22] "=r"(step1_22),
- [step1_23] "=r"(step1_23), [step1_24] "=r"(step1_24),
- [step1_25] "=r"(step1_25)
+ [temp2] "=&r"(temp2), [temp3] "=&r"(temp3),
+ [step1_22] "=&r"(step1_22), [step1_23] "=&r"(step1_23),
+ [step1_24] "=&r"(step1_24), [step1_25] "=&r"(step1_25)
: [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
[cospi_19_64] "r"(cospi_19_64), [cospi_13_64] "r"(cospi_13_64),
[cospi_3_64] "r"(cospi_3_64), [cospi_29_64] "r"(cospi_29_64),
@@ -349,7 +344,6 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
"madd $ac1, %[load1], %[cospi_30_64] \n\t"
"msub $ac1, %[load2], %[cospi_2_64] \n\t"
"extp %[temp0], $ac1, 31 \n\t"
-
"madd $ac3, %[load1], %[cospi_2_64] \n\t"
"madd $ac3, %[load2], %[cospi_30_64] \n\t"
"extp %[temp3], $ac3, 31 \n\t"
@@ -362,7 +356,6 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
"madd $ac2, %[load3], %[cospi_14_64] \n\t"
"msub $ac2, %[load4], %[cospi_18_64] \n\t"
"extp %[temp1], $ac2, 31 \n\t"
-
"madd $ac1, %[load3], %[cospi_18_64] \n\t"
"madd $ac1, %[load4], %[cospi_14_64] \n\t"
"extp %[temp2], $ac1, 31 \n\t"
@@ -374,12 +367,10 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
"sub %[load1], %[temp0], %[temp1] \n\t"
"sub %[load2], %[temp3], %[temp2] \n\t"
-
"msub $ac1, %[load1], %[cospi_8_64] \n\t"
"madd $ac1, %[load2], %[cospi_24_64] \n\t"
"madd $ac3, %[load1], %[cospi_24_64] \n\t"
"madd $ac3, %[load2], %[cospi_8_64] \n\t"
-
"extp %[step2_9], $ac1, 31 \n\t"
"extp %[step2_14], $ac3, 31 \n\t"
"add %[step2_8], %[temp0], %[temp1] \n\t"
@@ -387,9 +378,9 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
: [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
[load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
- [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_8] "=r"(step2_8),
- [step2_9] "=r"(step2_9), [step2_14] "=r"(step2_14),
- [step2_15] "=r"(step2_15)
+ [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_8] "=&r"(step2_8),
+ [step2_9] "=&r"(step2_9), [step2_14] "=&r"(step2_14),
+ [step2_15] "=&r"(step2_15)
: [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
[cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64),
[cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64),
@@ -409,7 +400,6 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
"madd $ac1, %[load1], %[cospi_22_64] \n\t"
"msub $ac1, %[load2], %[cospi_10_64] \n\t"
"extp %[temp0], $ac1, 31 \n\t"
-
"madd $ac3, %[load1], %[cospi_10_64] \n\t"
"madd $ac3, %[load2], %[cospi_22_64] \n\t"
"extp %[temp3], $ac3, 31 \n\t"
@@ -422,7 +412,6 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
"madd $ac2, %[load3], %[cospi_6_64] \n\t"
"msub $ac2, %[load4], %[cospi_26_64] \n\t"
"extp %[temp1], $ac2, 31 \n\t"
-
"madd $ac1, %[load3], %[cospi_26_64] \n\t"
"madd $ac1, %[load4], %[cospi_6_64] \n\t"
"extp %[temp2], $ac1, 31 \n\t"
@@ -434,12 +423,10 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
"sub %[load1], %[temp1], %[temp0] \n\t"
"sub %[load2], %[temp2], %[temp3] \n\t"
-
"msub $ac1, %[load1], %[cospi_24_64] \n\t"
"msub $ac1, %[load2], %[cospi_8_64] \n\t"
"madd $ac3, %[load2], %[cospi_24_64] \n\t"
"msub $ac3, %[load1], %[cospi_8_64] \n\t"
-
"extp %[step2_10], $ac1, 31 \n\t"
"extp %[step2_13], $ac3, 31 \n\t"
"add %[step2_11], %[temp0], %[temp1] \n\t"
@@ -447,9 +434,9 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
: [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
[load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
- [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_10] "=r"(step2_10),
- [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12),
- [step2_13] "=r"(step2_13)
+ [temp2] "=&r"(temp2), [temp3] "=&r"(temp3),
+ [step2_10] "=&r"(step2_10), [step2_11] "=&r"(step2_11),
+ [step2_12] "=&r"(step2_12), [step2_13] "=&r"(step2_13)
: [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
[cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64),
[cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64),
@@ -462,21 +449,18 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
"sub %[temp0], %[temp0], %[step2_9] \n\t"
"add %[temp0], %[temp0], %[step2_10] \n\t"
"madd $ac0, %[temp0], %[cospi_16_64] \n\t"
-
"mtlo %[const_2_power_13], $ac1 \n\t"
"mthi $zero, $ac1 \n\t"
"sub %[temp1], %[step2_14], %[step2_13] \n\t"
"add %[temp1], %[temp1], %[step2_9] \n\t"
"sub %[temp1], %[temp1], %[step2_10] \n\t"
"madd $ac1, %[temp1], %[cospi_16_64] \n\t"
-
"mtlo %[const_2_power_13], $ac2 \n\t"
"mthi $zero, $ac2 \n\t"
"sub %[temp0], %[step2_15], %[step2_12] \n\t"
"sub %[temp0], %[temp0], %[step2_8] \n\t"
"add %[temp0], %[temp0], %[step2_11] \n\t"
"madd $ac2, %[temp0], %[cospi_16_64] \n\t"
-
"mtlo %[const_2_power_13], $ac3 \n\t"
"mthi $zero, $ac3 \n\t"
"sub %[temp1], %[step2_15], %[step2_12] \n\t"
@@ -488,122 +472,159 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
"add %[step3_9], %[step2_9], %[step2_10] \n\t"
"add %[step3_14], %[step2_13], %[step2_14] \n\t"
"add %[step3_15], %[step2_12], %[step2_15] \n\t"
-
"extp %[step3_10], $ac0, 31 \n\t"
"extp %[step3_13], $ac1, 31 \n\t"
"extp %[step3_11], $ac2, 31 \n\t"
"extp %[step3_12], $ac3, 31 \n\t"
- : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [step3_8] "=r"(step3_8),
- [step3_9] "=r"(step3_9), [step3_10] "=r"(step3_10),
- [step3_11] "=r"(step3_11), [step3_12] "=r"(step3_12),
- [step3_13] "=r"(step3_13), [step3_14] "=r"(step3_14),
- [step3_15] "=r"(step3_15)
+ : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [step3_8] "=&r"(step3_8),
+ [step3_9] "=&r"(step3_9), [step3_10] "=&r"(step3_10),
+ [step3_11] "=&r"(step3_11), [step3_12] "=&r"(step3_12),
+ [step3_13] "=&r"(step3_13), [step3_14] "=&r"(step3_14),
+ [step3_15] "=&r"(step3_15)
: [const_2_power_13] "r"(const_2_power_13), [step2_8] "r"(step2_8),
[step2_9] "r"(step2_9), [step2_10] "r"(step2_10),
[step2_11] "r"(step2_11), [step2_12] "r"(step2_12),
[step2_13] "r"(step2_13), [step2_14] "r"(step2_14),
[step2_15] "r"(step2_15), [cospi_16_64] "r"(cospi_16_64));
- step2_18 = step1_17 - step1_18;
- step2_29 = step1_30 - step1_29;
-
__asm__ __volatile__(
"mtlo %[const_2_power_13], $ac0 \n\t"
"mthi $zero, $ac0 \n\t"
- "msub $ac0, %[step2_18], %[cospi_8_64] \n\t"
- "madd $ac0, %[step2_29], %[cospi_24_64] \n\t"
- "extp %[step3_18], $ac0, 31 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "sub %[temp0], %[step1_17], %[step1_18] \n\t"
+ "sub %[temp1], %[step1_30], %[step1_29] \n\t"
+ "add %[step3_17], %[step1_17], %[step1_18] \n\t"
+ "add %[step3_30], %[step1_30], %[step1_29] \n\t"
- : [step3_18] "=r"(step3_18)
- : [const_2_power_13] "r"(const_2_power_13), [step2_18] "r"(step2_18),
- [step2_29] "r"(step2_29), [cospi_24_64] "r"(cospi_24_64),
+ "msub $ac0, %[temp0], %[cospi_8_64] \n\t"
+ "madd $ac0, %[temp1], %[cospi_24_64] \n\t"
+ "extp %[step3_18], $ac0, 31 \n\t"
+ "madd $ac1, %[temp0], %[cospi_24_64] \n\t"
+ "madd $ac1, %[temp1], %[cospi_8_64] \n\t"
+ "extp %[step3_29], $ac1, 31 \n\t"
+
+ : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
+ [step3_18] "=&r"(step3_18), [step3_29] "=&r"(step3_29),
+ [step3_17] "=&r"(step3_17), [step3_30] "=&r"(step3_30)
+ : [const_2_power_13] "r"(const_2_power_13), [step1_17] "r"(step1_17),
+ [step1_18] "r"(step1_18), [step1_30] "r"(step1_30),
+ [step1_29] "r"(step1_29), [cospi_24_64] "r"(cospi_24_64),
[cospi_8_64] "r"(cospi_8_64));
- temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64;
- step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
-
- step2_19 = step1_16 - step1_19;
- step2_28 = step1_31 - step1_28;
-
__asm__ __volatile__(
"mtlo %[const_2_power_13], $ac0 \n\t"
"mthi $zero, $ac0 \n\t"
- "msub $ac0, %[step2_19], %[cospi_8_64] \n\t"
- "madd $ac0, %[step2_28], %[cospi_24_64] \n\t"
- "extp %[step3_19], $ac0, 31 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "sub %[temp0], %[step1_16], %[step1_19] \n\t"
+ "sub %[temp1], %[step1_31], %[step1_28] \n\t"
+ "add %[step3_16], %[step1_16], %[step1_19] \n\t"
+ "add %[step3_31], %[step1_31], %[step1_28] \n\t"
- : [step3_19] "=r"(step3_19)
- : [const_2_power_13] "r"(const_2_power_13), [step2_19] "r"(step2_19),
- [step2_28] "r"(step2_28), [cospi_24_64] "r"(cospi_24_64),
+ "msub $ac0, %[temp0], %[cospi_8_64] \n\t"
+ "madd $ac0, %[temp1], %[cospi_24_64] \n\t"
+ "extp %[step3_19], $ac0, 31 \n\t"
+ "madd $ac1, %[temp0], %[cospi_24_64] \n\t"
+ "madd $ac1, %[temp1], %[cospi_8_64] \n\t"
+ "extp %[step3_28], $ac1, 31 \n\t"
+
+ : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
+ [step3_16] "=&r"(step3_16), [step3_31] "=&r"(step3_31),
+ [step3_19] "=&r"(step3_19), [step3_28] "=&r"(step3_28)
+ : [const_2_power_13] "r"(const_2_power_13), [step1_16] "r"(step1_16),
+ [step1_19] "r"(step1_19), [step1_31] "r"(step1_31),
+ [step1_28] "r"(step1_28), [cospi_24_64] "r"(cospi_24_64),
[cospi_8_64] "r"(cospi_8_64));
- temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64;
- step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
-
- step3_16 = step1_16 + step1_19;
- step3_17 = step1_17 + step1_18;
- step3_30 = step1_29 + step1_30;
- step3_31 = step1_28 + step1_31;
-
- step2_20 = step1_23 - step1_20;
- step2_27 = step1_24 - step1_27;
-
__asm__ __volatile__(
"mtlo %[const_2_power_13], $ac0 \n\t"
"mthi $zero, $ac0 \n\t"
- "msub $ac0, %[step2_20], %[cospi_24_64] \n\t"
- "msub $ac0, %[step2_27], %[cospi_8_64] \n\t"
- "extp %[step3_20], $ac0, 31 \n\t"
-
- : [step3_20] "=r"(step3_20)
- : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20),
- [step2_27] "r"(step2_27), [cospi_24_64] "r"(cospi_24_64),
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "sub %[temp0], %[step1_23], %[step1_20] \n\t"
+ "sub %[temp1], %[step1_24], %[step1_27] \n\t"
+ "add %[step3_23], %[step1_23], %[step1_20] \n\t"
+ "add %[step3_24], %[step1_24], %[step1_27] \n\t"
+
+ "msub $ac0, %[temp0], %[cospi_8_64] \n\t"
+ "madd $ac0, %[temp1], %[cospi_24_64] \n\t"
+ "extp %[step3_27], $ac0, 31 \n\t"
+ "msub $ac1, %[temp0], %[cospi_24_64] \n\t"
+ "msub $ac1, %[temp1], %[cospi_8_64] \n\t"
+ "extp %[step3_20], $ac1, 31 \n\t"
+
+ : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
+ [step3_23] "=&r"(step3_23), [step3_24] "=&r"(step3_24),
+ [step3_20] "=&r"(step3_20), [step3_27] "=&r"(step3_27)
+ : [const_2_power_13] "r"(const_2_power_13), [step1_23] "r"(step1_23),
+ [step1_20] "r"(step1_20), [step1_24] "r"(step1_24),
+ [step1_27] "r"(step1_27), [cospi_24_64] "r"(cospi_24_64),
[cospi_8_64] "r"(cospi_8_64));
- temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64;
- step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
-
- step2_21 = step1_22 - step1_21;
- step2_26 = step1_25 - step1_26;
-
__asm__ __volatile__(
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
"mtlo %[const_2_power_13], $ac1 \n\t"
"mthi $zero, $ac1 \n\t"
- "msub $ac1, %[step2_21], %[cospi_24_64] \n\t"
- "msub $ac1, %[step2_26], %[cospi_8_64] \n\t"
- "extp %[step3_21], $ac1, 31 \n\t"
-
- : [step3_21] "=r"(step3_21)
- : [const_2_power_13] "r"(const_2_power_13), [step2_21] "r"(step2_21),
- [step2_26] "r"(step2_26), [cospi_24_64] "r"(cospi_24_64),
+ "sub %[temp0], %[step1_22], %[step1_21] \n\t"
+ "sub %[temp1], %[step1_25], %[step1_26] \n\t"
+ "add %[step3_22], %[step1_22], %[step1_21] \n\t"
+ "add %[step3_25], %[step1_25], %[step1_26] \n\t"
+
+ "msub $ac0, %[temp0], %[cospi_24_64] \n\t"
+ "msub $ac0, %[temp1], %[cospi_8_64] \n\t"
+ "extp %[step3_21], $ac0, 31 \n\t"
+ "msub $ac1, %[temp0], %[cospi_8_64] \n\t"
+ "madd $ac1, %[temp1], %[cospi_24_64] \n\t"
+ "extp %[step3_26], $ac1, 31 \n\t"
+
+ : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
+ [step3_22] "=&r"(step3_22), [step3_25] "=&r"(step3_25),
+ [step3_21] "=&r"(step3_21), [step3_26] "=&r"(step3_26)
+ : [const_2_power_13] "r"(const_2_power_13), [step1_22] "r"(step1_22),
+ [step1_21] "r"(step1_21), [step1_25] "r"(step1_25),
+ [step1_26] "r"(step1_26), [cospi_24_64] "r"(cospi_24_64),
[cospi_8_64] "r"(cospi_8_64));
- temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64;
- step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
-
- step3_22 = step1_21 + step1_22;
- step3_23 = step1_20 + step1_23;
- step3_24 = step1_24 + step1_27;
- step3_25 = step1_25 + step1_26;
-
- step2_16 = step3_16 + step3_23;
- step2_17 = step3_17 + step3_22;
- step2_18 = step3_18 + step3_21;
- step2_19 = step3_19 + step3_20;
- step2_20 = step3_19 - step3_20;
- step2_21 = step3_18 - step3_21;
- step2_22 = step3_17 - step3_22;
- step2_23 = step3_16 - step3_23;
-
- step2_24 = step3_31 - step3_24;
- step2_25 = step3_30 - step3_25;
- step2_26 = step3_29 - step3_26;
- step2_27 = step3_28 - step3_27;
- step2_28 = step3_28 + step3_27;
- step2_29 = step3_29 + step3_26;
- step2_30 = step3_30 + step3_25;
- step2_31 = step3_31 + step3_24;
+ __asm__ __volatile__(
+ "add %[step2_16], %[step3_16], %[step3_23] \n\t"
+ "add %[step2_17], %[step3_17], %[step3_22] \n\t"
+ "add %[step2_18], %[step3_18], %[step3_21] \n\t"
+ "add %[step2_19], %[step3_19], %[step3_20] \n\t"
+ "sub %[step2_20], %[step3_19], %[step3_20] \n\t"
+ "sub %[step2_21], %[step3_18], %[step3_21] \n\t"
+ "sub %[step2_22], %[step3_17], %[step3_22] \n\t"
+ "sub %[step2_23], %[step3_16], %[step3_23] \n\t"
+
+ : [step2_16] "=&r"(step2_16), [step2_17] "=&r"(step2_17),
+ [step2_18] "=&r"(step2_18), [step2_19] "=&r"(step2_19),
+ [step2_20] "=&r"(step2_20), [step2_21] "=&r"(step2_21),
+ [step2_22] "=&r"(step2_22), [step2_23] "=&r"(step2_23)
+ : [step3_16] "r"(step3_16), [step3_23] "r"(step3_23),
+ [step3_17] "r"(step3_17), [step3_22] "r"(step3_22),
+ [step3_18] "r"(step3_18), [step3_21] "r"(step3_21),
+ [step3_19] "r"(step3_19), [step3_20] "r"(step3_20));
+
+ __asm__ __volatile__(
+ "sub %[step2_24], %[step3_31], %[step3_24] \n\t"
+ "sub %[step2_25], %[step3_30], %[step3_25] \n\t"
+ "sub %[step2_26], %[step3_29], %[step3_26] \n\t"
+ "sub %[step2_27], %[step3_28], %[step3_27] \n\t"
+ "add %[step2_28], %[step3_28], %[step3_27] \n\t"
+ "add %[step2_29], %[step3_29], %[step3_26] \n\t"
+ "add %[step2_30], %[step3_30], %[step3_25] \n\t"
+ "add %[step2_31], %[step3_31], %[step3_24] \n\t"
+
+ : [step2_24] "=&r"(step2_24), [step2_28] "=&r"(step2_28),
+ [step2_25] "=&r"(step2_25), [step2_29] "=&r"(step2_29),
+ [step2_26] "=&r"(step2_26), [step2_30] "=&r"(step2_30),
+ [step2_27] "=&r"(step2_27), [step2_31] "=&r"(step2_31)
+ : [step3_31] "r"(step3_31), [step3_24] "r"(step3_24),
+ [step3_30] "r"(step3_30), [step3_25] "r"(step3_25),
+ [step3_29] "r"(step3_29), [step3_26] "r"(step3_26),
+ [step3_28] "r"(step3_28), [step3_27] "r"(step3_27));
__asm__ __volatile__(
"lh %[load1], 0(%[input]) \n\t"
@@ -627,29 +648,25 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
"madd $ac3, %[load3], %[cospi_24_64] \n\t"
"msub $ac3, %[load4], %[cospi_8_64] \n\t"
"extp %[temp2], $ac3, 31 \n\t"
-
"mtlo %[const_2_power_13], $ac1 \n\t"
"mthi $zero, $ac1 \n\t"
"madd $ac1, %[load3], %[cospi_8_64] \n\t"
"madd $ac1, %[load4], %[cospi_24_64] \n\t"
"extp %[temp3], $ac1, 31 \n\t"
-
- "add %[step1_0], %[temp0], %[temp3] \n\t"
- "add %[step1_1], %[temp1], %[temp2] \n\t"
- "sub %[step1_2], %[temp1], %[temp2] \n\t"
- "sub %[step1_3], %[temp0], %[temp3] \n\t"
+ "add %[step1_0], %[temp0], %[temp3] \n\t"
+ "add %[step1_1], %[temp1], %[temp2] \n\t"
+ "sub %[step1_2], %[temp1], %[temp2] \n\t"
+ "sub %[step1_3], %[temp0], %[temp3] \n\t"
: [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
[load4] "=&r"(load4), [result1] "=&r"(result1),
[result2] "=&r"(result2), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
- [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_0] "=r"(step1_0),
- [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2),
- [step1_3] "=r"(step1_3)
+ [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_0] "=&r"(step1_0),
+ [step1_1] "=&r"(step1_1), [step1_2] "=&r"(step1_2),
+ [step1_3] "=&r"(step1_3)
: [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
- [cospi_16_64] "r"(cospi_16_64), [cospi_24_64] "r"(cospi_24_64),
- [cospi_8_64] "r"(cospi_8_64)
-
- );
+ [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64),
+ [cospi_16_64] "r"(cospi_16_64));
__asm__ __volatile__(
"lh %[load1], 8(%[input]) \n\t"
@@ -665,7 +682,6 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
"madd $ac1, %[load1], %[cospi_28_64] \n\t"
"msub $ac1, %[load2], %[cospi_4_64] \n\t"
"extp %[temp0], $ac1, 31 \n\t"
-
"madd $ac3, %[load1], %[cospi_4_64] \n\t"
"madd $ac3, %[load2], %[cospi_28_64] \n\t"
"extp %[temp3], $ac3, 31 \n\t"
@@ -678,7 +694,6 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
"madd $ac2, %[load3], %[cospi_12_64] \n\t"
"msub $ac2, %[load4], %[cospi_20_64] \n\t"
"extp %[temp1], $ac2, 31 \n\t"
-
"madd $ac1, %[load3], %[cospi_20_64] \n\t"
"madd $ac1, %[load4], %[cospi_12_64] \n\t"
"extp %[temp2], $ac1, 31 \n\t"
@@ -691,11 +706,9 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
"sub %[load1], %[temp3], %[temp2] \n\t"
"sub %[load1], %[load1], %[temp0] \n\t"
"add %[load1], %[load1], %[temp1] \n\t"
-
"sub %[load2], %[temp0], %[temp1] \n\t"
"sub %[load2], %[load2], %[temp2] \n\t"
"add %[load2], %[load2], %[temp3] \n\t"
-
"madd $ac1, %[load1], %[cospi_16_64] \n\t"
"madd $ac3, %[load2], %[cospi_16_64] \n\t"
@@ -706,129 +719,246 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
: [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
[load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
- [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_4] "=r"(step1_4),
- [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6),
- [step1_7] "=r"(step1_7)
+ [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_4] "=&r"(step1_4),
+ [step1_5] "=&r"(step1_5), [step1_6] "=&r"(step1_6),
+ [step1_7] "=&r"(step1_7)
: [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
[cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64),
[cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64),
[cospi_16_64] "r"(cospi_16_64));
- step2_0 = step1_0 + step1_7;
- step2_1 = step1_1 + step1_6;
- step2_2 = step1_2 + step1_5;
- step2_3 = step1_3 + step1_4;
- step2_4 = step1_3 - step1_4;
- step2_5 = step1_2 - step1_5;
- step2_6 = step1_1 - step1_6;
- step2_7 = step1_0 - step1_7;
-
- step1_0 = step2_0 + step3_15;
- step1_1 = step2_1 + step3_14;
- step1_2 = step2_2 + step3_13;
- step1_3 = step2_3 + step3_12;
- step1_4 = step2_4 + step3_11;
- step1_5 = step2_5 + step3_10;
- step1_6 = step2_6 + step3_9;
- step1_7 = step2_7 + step3_8;
- step1_8 = step2_7 - step3_8;
- step1_9 = step2_6 - step3_9;
- step1_10 = step2_5 - step3_10;
- step1_11 = step2_4 - step3_11;
- step1_12 = step2_3 - step3_12;
- step1_13 = step2_2 - step3_13;
- step1_14 = step2_1 - step3_14;
- step1_15 = step2_0 - step3_15;
-
__asm__ __volatile__(
- "sub %[temp0], %[step2_27], %[step2_20] \n\t"
- "mtlo %[const_2_power_13], $ac0 \n\t"
- "mthi $zero, $ac0 \n\t"
- "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
- "extp %[step1_20], $ac0, 31 \n\t"
-
- : [temp0] "=&r"(temp0), [step1_20] "=r"(step1_20)
- : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20),
- [step2_27] "r"(step2_27), [cospi_16_64] "r"(cospi_16_64));
+ "add %[step2_0], %[step1_0], %[step1_7] \n\t"
+ "add %[step2_1], %[step1_1], %[step1_6] \n\t"
+ "add %[step2_2], %[step1_2], %[step1_5] \n\t"
+ "add %[step2_3], %[step1_3], %[step1_4] \n\t"
+ "sub %[step2_4], %[step1_3], %[step1_4] \n\t"
+ "sub %[step2_5], %[step1_2], %[step1_5] \n\t"
+ "sub %[step2_6], %[step1_1], %[step1_6] \n\t"
+ "sub %[step2_7], %[step1_0], %[step1_7] \n\t"
+
+ : [step2_0] "=&r"(step2_0), [step2_4] "=&r"(step2_4),
+ [step2_1] "=&r"(step2_1), [step2_5] "=&r"(step2_5),
+ [step2_2] "=&r"(step2_2), [step2_6] "=&r"(step2_6),
+ [step2_3] "=&r"(step2_3), [step2_7] "=&r"(step2_7)
+ : [step1_0] "r"(step1_0), [step1_7] "r"(step1_7),
+ [step1_1] "r"(step1_1), [step1_6] "r"(step1_6),
+ [step1_2] "r"(step1_2), [step1_5] "r"(step1_5),
+ [step1_3] "r"(step1_3), [step1_4] "r"(step1_4));
+
+ // stage 7
+ __asm__ __volatile__(
+ "add %[step1_0], %[step2_0], %[step3_15] \n\t"
+ "add %[step1_1], %[step2_1], %[step3_14] \n\t"
+ "add %[step1_2], %[step2_2], %[step3_13] \n\t"
+ "add %[step1_3], %[step2_3], %[step3_12] \n\t"
+ "sub %[step1_12], %[step2_3], %[step3_12] \n\t"
+ "sub %[step1_13], %[step2_2], %[step3_13] \n\t"
+ "sub %[step1_14], %[step2_1], %[step3_14] \n\t"
+ "sub %[step1_15], %[step2_0], %[step3_15] \n\t"
+
+ : [step1_0] "=&r"(step1_0), [step1_12] "=&r"(step1_12),
+ [step1_1] "=&r"(step1_1), [step1_13] "=&r"(step1_13),
+ [step1_2] "=&r"(step1_2), [step1_14] "=&r"(step1_14),
+ [step1_3] "=&r"(step1_3), [step1_15] "=&r"(step1_15)
+ : [step2_0] "r"(step2_0), [step3_15] "r"(step3_15),
+ [step2_1] "r"(step2_1), [step3_14] "r"(step3_14),
+ [step2_2] "r"(step2_2), [step3_13] "r"(step3_13),
+ [step2_3] "r"(step2_3), [step3_12] "r"(step3_12));
- temp21 = (step2_20 + step2_27) * cospi_16_64;
- step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+ __asm__ __volatile__(
+ "add %[step1_4], %[step2_4], %[step3_11] \n\t"
+ "add %[step1_5], %[step2_5], %[step3_10] \n\t"
+ "add %[step1_6], %[step2_6], %[step3_9] \n\t"
+ "add %[step1_7], %[step2_7], %[step3_8] \n\t"
+ "sub %[step1_8], %[step2_7], %[step3_8] \n\t"
+ "sub %[step1_9], %[step2_6], %[step3_9] \n\t"
+ "sub %[step1_10], %[step2_5], %[step3_10] \n\t"
+ "sub %[step1_11], %[step2_4], %[step3_11] \n\t"
+
+ : [step1_4] "=&r"(step1_4), [step1_8] "=&r"(step1_8),
+ [step1_5] "=&r"(step1_5), [step1_9] "=&r"(step1_9),
+ [step1_6] "=&r"(step1_6), [step1_10] "=&r"(step1_10),
+ [step1_7] "=&r"(step1_7), [step1_11] "=&r"(step1_11)
+ : [step2_4] "r"(step2_4), [step3_11] "r"(step3_11),
+ [step2_5] "r"(step2_5), [step3_10] "r"(step3_10),
+ [step2_6] "r"(step2_6), [step3_9] "r"(step3_9),
+ [step2_7] "r"(step2_7), [step3_8] "r"(step3_8));
__asm__ __volatile__(
- "sub %[temp0], %[step2_26], %[step2_21] \n\t"
+ "sub %[temp0], %[step2_27], %[step2_20] \n\t"
+ "add %[temp1], %[step2_27], %[step2_20] \n\t"
+ "sub %[temp2], %[step2_26], %[step2_21] \n\t"
+ "add %[temp3], %[step2_26], %[step2_21] \n\t"
+
"mtlo %[const_2_power_13], $ac0 \n\t"
"mthi $zero, $ac0 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
"madd $ac0, %[temp0], %[cospi_16_64] \n\t"
- "extp %[step1_21], $ac0, 31 \n\t"
+ "madd $ac1, %[temp1], %[cospi_16_64] \n\t"
+ "madd $ac2, %[temp2], %[cospi_16_64] \n\t"
+ "madd $ac3, %[temp3], %[cospi_16_64] \n\t"
- : [temp0] "=&r"(temp0), [step1_21] "=r"(step1_21)
- : [const_2_power_13] "r"(const_2_power_13), [step2_26] "r"(step2_26),
- [step2_21] "r"(step2_21), [cospi_16_64] "r"(cospi_16_64));
+ "extp %[step1_20], $ac0, 31 \n\t"
+ "extp %[step1_27], $ac1, 31 \n\t"
+ "extp %[step1_21], $ac2, 31 \n\t"
+ "extp %[step1_26], $ac3, 31 \n\t"
- temp21 = (step2_21 + step2_26) * cospi_16_64;
- step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+ : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
+ [temp3] "=&r"(temp3), [step1_20] "=&r"(step1_20),
+ [step1_27] "=&r"(step1_27), [step1_21] "=&r"(step1_21),
+ [step1_26] "=&r"(step1_26)
+ : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20),
+ [step2_27] "r"(step2_27), [step2_21] "r"(step2_21),
+ [step2_26] "r"(step2_26), [cospi_16_64] "r"(cospi_16_64));
__asm__ __volatile__(
"sub %[temp0], %[step2_25], %[step2_22] \n\t"
+ "add %[temp1], %[step2_25], %[step2_22] \n\t"
+ "sub %[temp2], %[step2_24], %[step2_23] \n\t"
+ "add %[temp3], %[step2_24], %[step2_23] \n\t"
+
"mtlo %[const_2_power_13], $ac0 \n\t"
"mthi $zero, $ac0 \n\t"
- "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
- "extp %[step1_22], $ac0, 31 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
- : [temp0] "=&r"(temp0), [step1_22] "=r"(step1_22)
- : [const_2_power_13] "r"(const_2_power_13), [step2_25] "r"(step2_25),
- [step2_22] "r"(step2_22), [cospi_16_64] "r"(cospi_16_64));
+ "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
+ "madd $ac1, %[temp1], %[cospi_16_64] \n\t"
+ "madd $ac2, %[temp2], %[cospi_16_64] \n\t"
+ "madd $ac3, %[temp3], %[cospi_16_64] \n\t"
- temp21 = (step2_22 + step2_25) * cospi_16_64;
- step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+ "extp %[step1_22], $ac0, 31 \n\t"
+ "extp %[step1_25], $ac1, 31 \n\t"
+ "extp %[step1_23], $ac2, 31 \n\t"
+ "extp %[step1_24], $ac3, 31 \n\t"
+
+ : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
+ [temp3] "=&r"(temp3), [step1_22] "=&r"(step1_22),
+ [step1_25] "=&r"(step1_25), [step1_23] "=&r"(step1_23),
+ [step1_24] "=&r"(step1_24)
+ : [const_2_power_13] "r"(const_2_power_13), [step2_22] "r"(step2_22),
+ [step2_25] "r"(step2_25), [step2_23] "r"(step2_23),
+ [step2_24] "r"(step2_24), [cospi_16_64] "r"(cospi_16_64));
+ // final stage
__asm__ __volatile__(
- "sub %[temp0], %[step2_24], %[step2_23] \n\t"
- "mtlo %[const_2_power_13], $ac0 \n\t"
- "mthi $zero, $ac0 \n\t"
- "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
- "extp %[step1_23], $ac0, 31 \n\t"
+ "add %[temp0], %[step1_0], %[step2_31] \n\t"
+ "add %[temp1], %[step1_1], %[step2_30] \n\t"
+ "add %[temp2], %[step1_2], %[step2_29] \n\t"
+ "add %[temp3], %[step1_3], %[step2_28] \n\t"
+ "sub %[load1], %[step1_3], %[step2_28] \n\t"
+ "sub %[load2], %[step1_2], %[step2_29] \n\t"
+ "sub %[load3], %[step1_1], %[step2_30] \n\t"
+ "sub %[load4], %[step1_0], %[step2_31] \n\t"
+ "sh %[temp0], 0(%[output]) \n\t"
+ "sh %[temp1], 64(%[output]) \n\t"
+ "sh %[temp2], 128(%[output]) \n\t"
+ "sh %[temp3], 192(%[output]) \n\t"
+ "sh %[load1], 1792(%[output]) \n\t"
+ "sh %[load2], 1856(%[output]) \n\t"
+ "sh %[load3], 1920(%[output]) \n\t"
+ "sh %[load4], 1984(%[output]) \n\t"
+
+ : [temp0] "=&r"(temp0), [load1] "=&r"(load1), [temp1] "=&r"(temp1),
+ [load2] "=&r"(load2), [temp2] "=&r"(temp2), [load3] "=&r"(load3),
+ [temp3] "=&r"(temp3), [load4] "=&r"(load4)
+ : [step1_0] "r"(step1_0), [step2_31] "r"(step2_31),
+ [step1_1] "r"(step1_1), [step2_30] "r"(step2_30),
+ [step1_2] "r"(step1_2), [step2_29] "r"(step2_29),
+ [step1_3] "r"(step1_3), [step2_28] "r"(step2_28),
+ [output] "r"(output));
- : [temp0] "=&r"(temp0), [step1_23] "=r"(step1_23)
- : [const_2_power_13] "r"(const_2_power_13), [step2_24] "r"(step2_24),
- [step2_23] "r"(step2_23), [cospi_16_64] "r"(cospi_16_64));
+ __asm__ __volatile__(
+ "add %[temp0], %[step1_4], %[step1_27] \n\t"
+ "add %[temp1], %[step1_5], %[step1_26] \n\t"
+ "add %[temp2], %[step1_6], %[step1_25] \n\t"
+ "add %[temp3], %[step1_7], %[step1_24] \n\t"
+ "sub %[load1], %[step1_7], %[step1_24] \n\t"
+ "sub %[load2], %[step1_6], %[step1_25] \n\t"
+ "sub %[load3], %[step1_5], %[step1_26] \n\t"
+ "sub %[load4], %[step1_4], %[step1_27] \n\t"
+ "sh %[temp0], 256(%[output]) \n\t"
+ "sh %[temp1], 320(%[output]) \n\t"
+ "sh %[temp2], 384(%[output]) \n\t"
+ "sh %[temp3], 448(%[output]) \n\t"
+ "sh %[load1], 1536(%[output]) \n\t"
+ "sh %[load2], 1600(%[output]) \n\t"
+ "sh %[load3], 1664(%[output]) \n\t"
+ "sh %[load4], 1728(%[output]) \n\t"
+
+ : [temp0] "=&r"(temp0), [load1] "=&r"(load1), [temp1] "=&r"(temp1),
+ [load2] "=&r"(load2), [temp2] "=&r"(temp2), [load3] "=&r"(load3),
+ [temp3] "=&r"(temp3), [load4] "=&r"(load4)
+ : [step1_4] "r"(step1_4), [step1_27] "r"(step1_27),
+ [step1_5] "r"(step1_5), [step1_26] "r"(step1_26),
+ [step1_6] "r"(step1_6), [step1_25] "r"(step1_25),
+ [step1_7] "r"(step1_7), [step1_24] "r"(step1_24),
+ [output] "r"(output));
- temp21 = (step2_23 + step2_24) * cospi_16_64;
- step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+ __asm__ __volatile__(
+ "add %[temp0], %[step1_8], %[step1_23] \n\t"
+ "add %[temp1], %[step1_9], %[step1_22] \n\t"
+ "add %[temp2], %[step1_10], %[step1_21] \n\t"
+ "add %[temp3], %[step1_11], %[step1_20] \n\t"
+ "sub %[load1], %[step1_11], %[step1_20] \n\t"
+ "sub %[load2], %[step1_10], %[step1_21] \n\t"
+ "sub %[load3], %[step1_9], %[step1_22] \n\t"
+ "sub %[load4], %[step1_8], %[step1_23] \n\t"
+ "sh %[temp0], 512(%[output]) \n\t"
+ "sh %[temp1], 576(%[output]) \n\t"
+ "sh %[temp2], 640(%[output]) \n\t"
+ "sh %[temp3], 704(%[output]) \n\t"
+ "sh %[load1], 1280(%[output]) \n\t"
+ "sh %[load2], 1344(%[output]) \n\t"
+ "sh %[load3], 1408(%[output]) \n\t"
+ "sh %[load4], 1472(%[output]) \n\t"
+
+ : [temp0] "=&r"(temp0), [load1] "=&r"(load1), [temp1] "=&r"(temp1),
+ [load2] "=&r"(load2), [temp2] "=&r"(temp2), [load3] "=&r"(load3),
+ [temp3] "=&r"(temp3), [load4] "=&r"(load4)
+ : [step1_8] "r"(step1_8), [step1_23] "r"(step1_23),
+ [step1_9] "r"(step1_9), [step1_22] "r"(step1_22),
+ [step1_10] "r"(step1_10), [step1_21] "r"(step1_21),
+ [step1_11] "r"(step1_11), [step1_20] "r"(step1_20),
+ [output] "r"(output));
- // final stage
- output[0 * 32] = step1_0 + step2_31;
- output[1 * 32] = step1_1 + step2_30;
- output[2 * 32] = step1_2 + step2_29;
- output[3 * 32] = step1_3 + step2_28;
- output[4 * 32] = step1_4 + step1_27;
- output[5 * 32] = step1_5 + step1_26;
- output[6 * 32] = step1_6 + step1_25;
- output[7 * 32] = step1_7 + step1_24;
- output[8 * 32] = step1_8 + step1_23;
- output[9 * 32] = step1_9 + step1_22;
- output[10 * 32] = step1_10 + step1_21;
- output[11 * 32] = step1_11 + step1_20;
- output[12 * 32] = step1_12 + step2_19;
- output[13 * 32] = step1_13 + step2_18;
- output[14 * 32] = step1_14 + step2_17;
- output[15 * 32] = step1_15 + step2_16;
- output[16 * 32] = step1_15 - step2_16;
- output[17 * 32] = step1_14 - step2_17;
- output[18 * 32] = step1_13 - step2_18;
- output[19 * 32] = step1_12 - step2_19;
- output[20 * 32] = step1_11 - step1_20;
- output[21 * 32] = step1_10 - step1_21;
- output[22 * 32] = step1_9 - step1_22;
- output[23 * 32] = step1_8 - step1_23;
- output[24 * 32] = step1_7 - step1_24;
- output[25 * 32] = step1_6 - step1_25;
- output[26 * 32] = step1_5 - step1_26;
- output[27 * 32] = step1_4 - step1_27;
- output[28 * 32] = step1_3 - step2_28;
- output[29 * 32] = step1_2 - step2_29;
- output[30 * 32] = step1_1 - step2_30;
- output[31 * 32] = step1_0 - step2_31;
+ __asm__ __volatile__(
+ "add %[temp0], %[step1_12], %[step2_19] \n\t"
+ "add %[temp1], %[step1_13], %[step2_18] \n\t"
+ "add %[temp2], %[step1_14], %[step2_17] \n\t"
+ "add %[temp3], %[step1_15], %[step2_16] \n\t"
+ "sub %[load1], %[step1_15], %[step2_16] \n\t"
+ "sub %[load2], %[step1_14], %[step2_17] \n\t"
+ "sub %[load3], %[step1_13], %[step2_18] \n\t"
+ "sub %[load4], %[step1_12], %[step2_19] \n\t"
+ "sh %[temp0], 768(%[output]) \n\t"
+ "sh %[temp1], 832(%[output]) \n\t"
+ "sh %[temp2], 896(%[output]) \n\t"
+ "sh %[temp3], 960(%[output]) \n\t"
+ "sh %[load1], 1024(%[output]) \n\t"
+ "sh %[load2], 1088(%[output]) \n\t"
+ "sh %[load3], 1152(%[output]) \n\t"
+ "sh %[load4], 1216(%[output]) \n\t"
+
+ : [temp0] "=&r"(temp0), [load1] "=&r"(load1), [temp1] "=&r"(temp1),
+ [load2] "=&r"(load2), [temp2] "=&r"(temp2), [load3] "=&r"(load3),
+ [temp3] "=&r"(temp3), [load4] "=&r"(load4)
+ : [step1_12] "r"(step1_12), [step2_19] "r"(step2_19),
+ [step1_13] "r"(step1_13), [step2_18] "r"(step2_18),
+ [step1_14] "r"(step1_14), [step2_17] "r"(step2_17),
+ [step1_15] "r"(step1_15), [step2_16] "r"(step2_16),
+ [output] "r"(output));
input += 32;
output += 1;
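The sh stores in the final stage above use byte offsets into the int16_t output buffer: with 32 columns per row, element output[32 * n] sits at byte offset 64 * n, which is why the immediates run 0, 64, ..., 1984 and mirror the output[n * 32] assignments they replace. A minimal scalar sketch of that addressing, assuming sizeof(int16_t) == 2 and the 32-column layout used by idct32_rows_dspr2:

    #include <stdint.h>

    /* Scalar equivalent of "sh value, (64 * row)(output)" in the stores above. */
    static void store_col(int16_t *output, int row, int16_t value) {
      output[32 * row] = value; /* halfword at byte offset 64 * row */
    }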
@@ -836,7 +966,7 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
}
void vpx_idct32x32_1024_add_dspr2(const int16_t *input, uint8_t *dest,
- int dest_stride) {
+ int stride) {
DECLARE_ALIGNED(32, int16_t, out[32 * 32]);
int16_t *outptr = out;
uint32_t pos = 45;
@@ -850,7 +980,7 @@ void vpx_idct32x32_1024_add_dspr2(const int16_t *input, uint8_t *dest,
idct32_rows_dspr2(input, outptr, 32);
// Columns
- vpx_idct32_cols_add_blk_dspr2(out, dest, dest_stride);
+ vpx_idct32_cols_add_blk_dspr2(out, dest, stride);
}
void vpx_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest,
@@ -941,7 +1071,7 @@ void vpx_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest,
"abs %[absa1], %[a1] \n\t"
"replv.qb %[vector_a1], %[absa1] \n\t"
- : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1)
+ : [absa1] "=&r"(absa1), [vector_a1] "=&r"(vector_a1)
: [a1] "r"(a1));
for (r = 32; r--;) {
@@ -980,12 +1110,71 @@ void vpx_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest,
[dest] "+&r"(dest)
: [stride] "r"(stride), [vector_a1] "r"(vector_a1));
}
+ } else if (a1 > 255) {
+ int32_t a11, a12, vector_a11, vector_a12;
+
+ /* use quad-byte
+ * input and output memory are four byte aligned */
+ a11 = a1 >> 1;
+ a12 = a1 - a11;
+ __asm__ __volatile__(
+ "replv.qb %[vector_a11], %[a11] \n\t"
+ "replv.qb %[vector_a12], %[a12] \n\t"
+
+ : [vector_a11] "=&r"(vector_a11), [vector_a12] "=&r"(vector_a12)
+ : [a11] "r"(a11), [a12] "r"(a12));
+
+ for (r = 32; r--;) {
+ __asm__ __volatile__(
+ "lw %[t1], 0(%[dest]) \n\t"
+ "lw %[t2], 4(%[dest]) \n\t"
+ "lw %[t3], 8(%[dest]) \n\t"
+ "lw %[t4], 12(%[dest]) \n\t"
+ "addu_s.qb %[vector_1], %[t1], %[vector_a11] \n\t"
+ "addu_s.qb %[vector_2], %[t2], %[vector_a11] \n\t"
+ "addu_s.qb %[vector_3], %[t3], %[vector_a11] \n\t"
+ "addu_s.qb %[vector_4], %[t4], %[vector_a11] \n\t"
+ "addu_s.qb %[vector_1], %[vector_1], %[vector_a12] \n\t"
+ "addu_s.qb %[vector_2], %[vector_2], %[vector_a12] \n\t"
+ "addu_s.qb %[vector_3], %[vector_3], %[vector_a12] \n\t"
+ "addu_s.qb %[vector_4], %[vector_4], %[vector_a12] \n\t"
+ "sw %[vector_1], 0(%[dest]) \n\t"
+ "sw %[vector_2], 4(%[dest]) \n\t"
+ "sw %[vector_3], 8(%[dest]) \n\t"
+ "sw %[vector_4], 12(%[dest]) \n\t"
+
+ "lw %[t1], 16(%[dest]) \n\t"
+ "lw %[t2], 20(%[dest]) \n\t"
+ "lw %[t3], 24(%[dest]) \n\t"
+ "lw %[t4], 28(%[dest]) \n\t"
+ "addu_s.qb %[vector_1], %[t1], %[vector_a11] \n\t"
+ "addu_s.qb %[vector_2], %[t2], %[vector_a11] \n\t"
+ "addu_s.qb %[vector_3], %[t3], %[vector_a11] \n\t"
+ "addu_s.qb %[vector_4], %[t4], %[vector_a11] \n\t"
+ "addu_s.qb %[vector_1], %[vector_1], %[vector_a12] \n\t"
+ "addu_s.qb %[vector_2], %[vector_2], %[vector_a12] \n\t"
+ "addu_s.qb %[vector_3], %[vector_3], %[vector_a12] \n\t"
+ "addu_s.qb %[vector_4], %[vector_4], %[vector_a12] \n\t"
+ "sw %[vector_1], 16(%[dest]) \n\t"
+ "sw %[vector_2], 20(%[dest]) \n\t"
+ "sw %[vector_3], 24(%[dest]) \n\t"
+ "sw %[vector_4], 28(%[dest]) \n\t"
+
+ "add %[dest], %[dest], %[stride] \n\t"
+
+ : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4),
+ [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2),
+ [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4),
+ [dest] "+&r"(dest)
+ : [stride] "r"(stride), [vector_a11] "r"(vector_a11),
+ [vector_a12] "r"(vector_a12));
+ }
} else {
/* use quad-byte
* input and output memory are four byte aligned */
__asm__ __volatile__("replv.qb %[vector_a1], %[a1] \n\t"
- : [vector_a1] "=r"(vector_a1)
+ : [vector_a1] "=&r"(vector_a1)
: [a1] "r"(a1));
for (r = 32; r--;) {
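A recurring change in this file is tightening the inline-asm output constraints from "=r" to "=&r". The early-clobber modifier tells GCC that the output is written before every input has been consumed, so the output may not share a register with any input; without it the compiler may alias them and an input still needed later in the asm body can be silently corrupted. A hypothetical stand-in (MIPS-targeted, not the real DSPr2 sequence) showing the pattern:

    /* 'out' is produced before 'b' is read, so it must not share b's
     * register; the '&' in "=&r" guarantees that. */
    static int scale_then_add(int a, int b) {
      int out;
      __asm__ __volatile__(
          "addu %[out], %[a], %[a]   \n\t"
          "addu %[out], %[out], %[b] \n\t"
          : [out] "=&r"(out)
          : [a] "r"(a), [b] "r"(b));
      return out;
    }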
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/itrans4_dspr2.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/itrans4_dspr2.c
index 516ea80f4ae..3f985b847b1 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/itrans4_dspr2.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/itrans4_dspr2.c
@@ -15,7 +15,7 @@
#if HAVE_DSPR2
void vpx_idct4_rows_dspr2(const int16_t *input, int16_t *output) {
- int16_t step_0, step_1, step_2, step_3;
+ int step_0, step_1, step_2, step_3;
int Temp0, Temp1, Temp2, Temp3;
const int const_2_power_13 = 8192;
int i;
@@ -96,23 +96,13 @@ void vpx_idct4_rows_dspr2(const int16_t *input, int16_t *output) {
}
void vpx_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
- int dest_stride) {
- int16_t step_0, step_1, step_2, step_3;
+ int stride) {
+ int step_0, step_1, step_2, step_3;
int Temp0, Temp1, Temp2, Temp3;
const int const_2_power_13 = 8192;
+ const int const_255 = 255;
int i;
uint8_t *dest_pix;
- uint8_t *cm = vpx_ff_cropTbl;
-
- /* prefetch vpx_ff_cropTbl */
- prefetch_load(vpx_ff_cropTbl);
- prefetch_load(vpx_ff_cropTbl + 32);
- prefetch_load(vpx_ff_cropTbl + 64);
- prefetch_load(vpx_ff_cropTbl + 96);
- prefetch_load(vpx_ff_cropTbl + 128);
- prefetch_load(vpx_ff_cropTbl + 160);
- prefetch_load(vpx_ff_cropTbl + 192);
- prefetch_load(vpx_ff_cropTbl + 224);
for (i = 0; i < 4; ++i) {
dest_pix = (dest + i);
@@ -172,51 +162,62 @@ void vpx_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
"sra %[Temp0], %[Temp0], 4 \n\t"
"lbu %[Temp1], 0(%[dest_pix]) \n\t"
"add %[Temp1], %[Temp1], %[Temp0] \n\t"
- "add %[Temp0], %[step_1], %[step_2] \n\t"
- "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
- "sb %[Temp2], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "slt %[Temp2], %[Temp1], %[const_255] \n\t"
+ "slt %[Temp3], $zero, %[Temp1] \n\t"
+ "movz %[Temp1], %[const_255], %[Temp2] \n\t"
+ "movz %[Temp1], $zero, %[Temp3] \n\t"
+ "sb %[Temp1], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
+ "add %[Temp0], %[step_1], %[step_2] \n\t"
"addi %[Temp0], %[Temp0], 8 \n\t"
"sra %[Temp0], %[Temp0], 4 \n\t"
"lbu %[Temp1], 0(%[dest_pix]) \n\t"
"add %[Temp1], %[Temp1], %[Temp0] \n\t"
- "sub %[Temp0], %[step_1], %[step_2] \n\t"
- "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
- "sb %[Temp2], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "slt %[Temp2], %[Temp1], %[const_255] \n\t"
+ "slt %[Temp3], $zero, %[Temp1] \n\t"
+ "movz %[Temp1], %[const_255], %[Temp2] \n\t"
+ "movz %[Temp1], $zero, %[Temp3] \n\t"
+ "sb %[Temp1], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
+ "sub %[Temp0], %[step_1], %[step_2] \n\t"
"addi %[Temp0], %[Temp0], 8 \n\t"
"sra %[Temp0], %[Temp0], 4 \n\t"
"lbu %[Temp1], 0(%[dest_pix]) \n\t"
"add %[Temp1], %[Temp1], %[Temp0] \n\t"
- "sub %[Temp0], %[step_0], %[step_3] \n\t"
- "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
- "sb %[Temp2], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "slt %[Temp2], %[Temp1], %[const_255] \n\t"
+ "slt %[Temp3], $zero, %[Temp1] \n\t"
+ "movz %[Temp1], %[const_255], %[Temp2] \n\t"
+ "movz %[Temp1], $zero, %[Temp3] \n\t"
+ "sb %[Temp1], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
+ "sub %[Temp0], %[step_0], %[step_3] \n\t"
"addi %[Temp0], %[Temp0], 8 \n\t"
"sra %[Temp0], %[Temp0], 4 \n\t"
"lbu %[Temp1], 0(%[dest_pix]) \n\t"
"add %[Temp1], %[Temp1], %[Temp0] \n\t"
- "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
- "sb %[Temp2], 0(%[dest_pix]) \n\t"
+ "slt %[Temp2], %[Temp1], %[const_255] \n\t"
+ "slt %[Temp3], $zero, %[Temp1] \n\t"
+ "movz %[Temp1], %[const_255], %[Temp2] \n\t"
+ "movz %[Temp1], $zero, %[Temp3] \n\t"
+ "sb %[Temp1], 0(%[dest_pix]) \n\t"
: [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
[Temp3] "=&r"(Temp3), [step_0] "=&r"(step_0), [step_1] "=&r"(step_1),
[step_2] "=&r"(step_2), [step_3] "=&r"(step_3),
[dest_pix] "+r"(dest_pix)
- : [const_2_power_13] "r"(const_2_power_13),
+ : [const_2_power_13] "r"(const_2_power_13), [const_255] "r"(const_255),
[cospi_8_64] "r"(cospi_8_64), [cospi_16_64] "r"(cospi_16_64),
- [cospi_24_64] "r"(cospi_24_64), [input] "r"(input), [cm] "r"(cm),
- [dest_stride] "r"(dest_stride));
+ [cospi_24_64] "r"(cospi_24_64), [input] "r"(input),
+ [stride] "r"(stride));
input += 4;
}
}
-void vpx_idct4x4_16_add_dspr2(const int16_t *input, uint8_t *dest,
- int dest_stride) {
+void vpx_idct4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, int stride) {
DECLARE_ALIGNED(32, int16_t, out[4 * 4]);
int16_t *outptr = out;
uint32_t pos = 45;
@@ -230,11 +231,10 @@ void vpx_idct4x4_16_add_dspr2(const int16_t *input, uint8_t *dest,
vpx_idct4_rows_dspr2(input, outptr);
// Columns
- vpx_idct4_columns_add_blk_dspr2(&out[0], dest, dest_stride);
+ vpx_idct4_columns_add_blk_dspr2(&out[0], dest, stride);
}
-void vpx_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest,
- int dest_stride) {
+void vpx_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest, int stride) {
int a1, absa1;
int r;
int32_t out;
@@ -271,10 +271,43 @@ void vpx_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest,
"lw %[t2], 0(%[dest]) \n\t"
"subu_s.qb %[vector_a], %[t2], %[vector_a1] \n\t"
"sw %[vector_a], 0(%[dest]) \n\t"
- "add %[dest], %[dest], %[dest_stride] \n\t"
+ "add %[dest], %[dest], %[stride] \n\t"
+
+ : [t2] "=&r"(t2), [vector_a] "=&r"(vector_a), [dest] "+&r"(dest)
+ : [stride] "r"(stride), [vector_a1] "r"(vector_a1));
+ }
+ } else if (a1 > 255) {
+ int32_t a11, a12, vector_a11, vector_a12;
+
+ /* use quad-byte
+ * input and output memory are four byte aligned */
+ a11 = a1 >> 3;
+ a12 = a1 - (a11 * 7);
+
+ __asm__ __volatile__(
+ "replv.qb %[vector_a11], %[a11] \n\t"
+ "replv.qb %[vector_a12], %[a12] \n\t"
+
+ : [vector_a11] "=&r"(vector_a11), [vector_a12] "=&r"(vector_a12)
+ : [a11] "r"(a11), [a12] "r"(a12));
+
+ for (r = 4; r--;) {
+ __asm__ __volatile__(
+ "lw %[t2], 4(%[dest]) \n\t"
+ "addu_s.qb %[vector_a], %[t2], %[vector_a11] \n\t"
+ "addu_s.qb %[vector_a], %[vector_a], %[vector_a11] \n\t"
+ "addu_s.qb %[vector_a], %[vector_a], %[vector_a11] \n\t"
+ "addu_s.qb %[vector_a], %[vector_a], %[vector_a11] \n\t"
+ "addu_s.qb %[vector_a], %[vector_a], %[vector_a11] \n\t"
+ "addu_s.qb %[vector_a], %[vector_a], %[vector_a11] \n\t"
+ "addu_s.qb %[vector_a], %[vector_a], %[vector_a11] \n\t"
+ "addu_s.qb %[vector_a], %[vector_a], %[vector_a12] \n\t"
+ "sw %[vector_a], 0(%[dest]) \n\t"
+ "add %[dest], %[dest], %[stride] \n\t"
: [t2] "=&r"(t2), [vector_a] "=&r"(vector_a), [dest] "+&r"(dest)
- : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1));
+ : [stride] "r"(stride), [vector_a11] "r"(vector_a11),
+ [vector_a12] "r"(vector_a12));
}
} else {
/* use quad-byte
@@ -288,10 +321,10 @@ void vpx_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest,
"lw %[t2], 0(%[dest]) \n\t"
"addu_s.qb %[vector_a], %[t2], %[vector_a1] \n\t"
"sw %[vector_a], 0(%[dest]) \n\t"
- "add %[dest], %[dest], %[dest_stride] \n\t"
+ "add %[dest], %[dest], %[stride] \n\t"
: [t2] "=&r"(t2), [vector_a] "=&r"(vector_a), [dest] "+&r"(dest)
- : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1));
+ : [stride] "r"(stride), [vector_a1] "r"(vector_a1));
}
}
}
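The lbux lookups through vpx_ff_cropTbl above are replaced by an slt/movz sequence against 0 and 255, i.e. a plain clamp of the reconstructed pixel before the sb store, which is why the crop-table pointer and its prefetches could be dropped. A scalar sketch of what the slt/movz pair computes (assuming movz overwrites the destination only when the condition register is zero):

    /* Clamp to [0, 255]:
     *   slt t2, v, 255 ; movz v, 255, t2   -> if (!(v < 255)) v = 255;
     *   slt t3, 0, v   ; movz v, 0,   t3   -> if (!(0 < v))   v = 0;    */
    static unsigned char clip_pixel(int v) {
      if (v >= 255) return 255;
      if (v <= 0) return 0;
      return (unsigned char)v;
    }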
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/itrans8_dspr2.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/itrans8_dspr2.c
index 08a6c78b6e4..d4d246965c3 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/itrans8_dspr2.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/itrans8_dspr2.c
@@ -192,24 +192,13 @@ void idct8_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows) {
}
}
-void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
- int dest_stride) {
+void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, int stride) {
int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
int Temp0, Temp1, Temp2, Temp3;
int i;
const int const_2_power_13 = 8192;
+ const int const_255 = 255;
uint8_t *dest_pix;
- uint8_t *cm = vpx_ff_cropTbl;
-
- /* prefetch vpx_ff_cropTbl */
- prefetch_load(vpx_ff_cropTbl);
- prefetch_load(vpx_ff_cropTbl + 32);
- prefetch_load(vpx_ff_cropTbl + 64);
- prefetch_load(vpx_ff_cropTbl + 96);
- prefetch_load(vpx_ff_cropTbl + 128);
- prefetch_load(vpx_ff_cropTbl + 160);
- prefetch_load(vpx_ff_cropTbl + 192);
- prefetch_load(vpx_ff_cropTbl + 224);
for (i = 0; i < 8; ++i) {
dest_pix = (dest + i);
@@ -356,70 +345,94 @@ void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
"sra %[Temp0], %[Temp0], 5 \n\t"
"add %[Temp1], %[Temp1], %[Temp0] \n\t"
"add %[Temp0], %[step1_1], %[step1_6] \n\t"
- "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
- "sb %[Temp2], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "slt %[Temp2], %[Temp1], %[const_255] \n\t"
+ "slt %[Temp3], $zero, %[Temp1] \n\t"
+ "movz %[Temp1], %[const_255], %[Temp2] \n\t"
+ "movz %[Temp1], $zero, %[Temp3] \n\t"
+ "sb %[Temp1], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"lbu %[Temp1], 0(%[dest_pix]) \n\t"
"addi %[Temp0], %[Temp0], 16 \n\t"
"sra %[Temp0], %[Temp0], 5 \n\t"
"add %[Temp1], %[Temp1], %[Temp0] \n\t"
"add %[Temp0], %[step1_2], %[step1_5] \n\t"
- "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
- "sb %[Temp2], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "slt %[Temp2], %[Temp1], %[const_255] \n\t"
+ "slt %[Temp3], $zero, %[Temp1] \n\t"
+ "movz %[Temp1], %[const_255], %[Temp2] \n\t"
+ "movz %[Temp1], $zero, %[Temp3] \n\t"
+ "sb %[Temp1], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"lbu %[Temp1], 0(%[dest_pix]) \n\t"
"addi %[Temp0], %[Temp0], 16 \n\t"
"sra %[Temp0], %[Temp0], 5 \n\t"
"add %[Temp1], %[Temp1], %[Temp0] \n\t"
"add %[Temp0], %[step1_3], %[step1_4] \n\t"
- "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
- "sb %[Temp2], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "slt %[Temp2], %[Temp1], %[const_255] \n\t"
+ "slt %[Temp3], $zero, %[Temp1] \n\t"
+ "movz %[Temp1], %[const_255], %[Temp2] \n\t"
+ "movz %[Temp1], $zero, %[Temp3] \n\t"
+ "sb %[Temp1], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"lbu %[Temp1], 0(%[dest_pix]) \n\t"
"addi %[Temp0], %[Temp0], 16 \n\t"
"sra %[Temp0], %[Temp0], 5 \n\t"
"add %[Temp1], %[Temp1], %[Temp0] \n\t"
"sub %[Temp0], %[step1_3], %[step1_4] \n\t"
- "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
- "sb %[Temp2], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "slt %[Temp2], %[Temp1], %[const_255] \n\t"
+ "slt %[Temp3], $zero, %[Temp1] \n\t"
+ "movz %[Temp1], %[const_255], %[Temp2] \n\t"
+ "movz %[Temp1], $zero, %[Temp3] \n\t"
+ "sb %[Temp1], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"lbu %[Temp1], 0(%[dest_pix]) \n\t"
"addi %[Temp0], %[Temp0], 16 \n\t"
"sra %[Temp0], %[Temp0], 5 \n\t"
"add %[Temp1], %[Temp1], %[Temp0] \n\t"
"sub %[Temp0], %[step1_2], %[step1_5] \n\t"
- "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
- "sb %[Temp2], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "slt %[Temp2], %[Temp1], %[const_255] \n\t"
+ "slt %[Temp3], $zero, %[Temp1] \n\t"
+ "movz %[Temp1], %[const_255], %[Temp2] \n\t"
+ "movz %[Temp1], $zero, %[Temp3] \n\t"
+ "sb %[Temp1], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"lbu %[Temp1], 0(%[dest_pix]) \n\t"
"addi %[Temp0], %[Temp0], 16 \n\t"
"sra %[Temp0], %[Temp0], 5 \n\t"
"add %[Temp1], %[Temp1], %[Temp0] \n\t"
"sub %[Temp0], %[step1_1], %[step1_6] \n\t"
- "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
- "sb %[Temp2], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "slt %[Temp2], %[Temp1], %[const_255] \n\t"
+ "slt %[Temp3], $zero, %[Temp1] \n\t"
+ "movz %[Temp1], %[const_255], %[Temp2] \n\t"
+ "movz %[Temp1], $zero, %[Temp3] \n\t"
+ "sb %[Temp1], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"lbu %[Temp1], 0(%[dest_pix]) \n\t"
"addi %[Temp0], %[Temp0], 16 \n\t"
"sra %[Temp0], %[Temp0], 5 \n\t"
"add %[Temp1], %[Temp1], %[Temp0] \n\t"
"sub %[Temp0], %[step1_0], %[step1_7] \n\t"
- "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
- "sb %[Temp2], 0(%[dest_pix]) \n\t"
- "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "slt %[Temp2], %[Temp1], %[const_255] \n\t"
+ "slt %[Temp3], $zero, %[Temp1] \n\t"
+ "movz %[Temp1], %[const_255], %[Temp2] \n\t"
+ "movz %[Temp1], $zero, %[Temp3] \n\t"
+ "sb %[Temp1], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
"lbu %[Temp1], 0(%[dest_pix]) \n\t"
"addi %[Temp0], %[Temp0], 16 \n\t"
"sra %[Temp0], %[Temp0], 5 \n\t"
"add %[Temp1], %[Temp1], %[Temp0] \n\t"
- "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
- "sb %[Temp2], 0(%[dest_pix]) \n\t"
+ "slt %[Temp2], %[Temp1], %[const_255] \n\t"
+ "slt %[Temp3], $zero, %[Temp1] \n\t"
+ "movz %[Temp1], %[const_255], %[Temp2] \n\t"
+ "movz %[Temp1], $zero, %[Temp3] \n\t"
+ "sb %[Temp1], 0(%[dest_pix]) \n\t"
: [step1_0] "=&r"(step1_0), [step1_1] "=&r"(step1_1),
[step1_2] "=&r"(step1_2), [step1_3] "=&r"(step1_3),
@@ -427,19 +440,18 @@ void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
[step1_6] "=&r"(step1_6), [step1_7] "=&r"(step1_7),
[Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
[Temp3] "=&r"(Temp3), [dest_pix] "+r"(dest_pix)
- : [const_2_power_13] "r"(const_2_power_13),
+ : [const_2_power_13] "r"(const_2_power_13), [const_255] "r"(const_255),
[cospi_16_64] "r"(cospi_16_64), [cospi_28_64] "r"(cospi_28_64),
[cospi_4_64] "r"(cospi_4_64), [cospi_12_64] "r"(cospi_12_64),
[cospi_20_64] "r"(cospi_20_64), [cospi_8_64] "r"(cospi_8_64),
- [cospi_24_64] "r"(cospi_24_64), [input] "r"(input), [cm] "r"(cm),
- [dest_stride] "r"(dest_stride));
+ [cospi_24_64] "r"(cospi_24_64), [input] "r"(input),
+ [stride] "r"(stride));
input += 8;
}
}
-void vpx_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest,
- int dest_stride) {
+void vpx_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, int stride) {
DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
int16_t *outptr = out;
uint32_t pos = 45;
@@ -451,11 +463,10 @@ void vpx_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest,
idct8_rows_dspr2(input, outptr, 8);
// Then transform columns and add to dest
- idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride);
+ idct8_columns_add_blk_dspr2(&out[0], dest, stride);
}
-void vpx_idct8x8_12_add_dspr2(const int16_t *input, uint8_t *dest,
- int dest_stride) {
+void vpx_idct8x8_12_add_dspr2(const int16_t *input, uint8_t *dest, int stride) {
DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
int16_t *outptr = out;
uint32_t pos = 45;
@@ -490,11 +501,10 @@ void vpx_idct8x8_12_add_dspr2(const int16_t *input, uint8_t *dest,
: [outptr] "r"(outptr));
// Then transform columns and add to dest
- idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride);
+ idct8_columns_add_blk_dspr2(&out[0], dest, stride);
}
-void vpx_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest,
- int dest_stride) {
+void vpx_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest, int stride) {
uint32_t pos = 45;
int32_t out;
int32_t r;
@@ -533,11 +543,47 @@ void vpx_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest,
"subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
"sw %[vector_1], 0(%[dest]) \n\t"
"sw %[vector_2], 4(%[dest]) \n\t"
- "add %[dest], %[dest], %[dest_stride] \n\t"
+ "add %[dest], %[dest], %[stride] \n\t"
: [t1] "=&r"(t1), [t2] "=&r"(t2), [vector_1] "=&r"(vector_1),
[vector_2] "=&r"(vector_2), [dest] "+&r"(dest)
- : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1));
+ : [stride] "r"(stride), [vector_a1] "r"(vector_a1));
+ }
+ } else if (a1 > 255) {
+ int32_t a11, a12, vector_a11, vector_a12;
+
+ /* use quad-byte
+ * input and output memory are four byte aligned */
+ a11 = a1 >> 2;
+ a12 = a1 - (a11 * 3);
+
+ __asm__ __volatile__(
+ "replv.qb %[vector_a11], %[a11] \n\t"
+ "replv.qb %[vector_a12], %[a12] \n\t"
+
+ : [vector_a11] "=&r"(vector_a11), [vector_a12] "=&r"(vector_a12)
+ : [a11] "r"(a11), [a12] "r"(a12));
+
+ for (r = 8; r--;) {
+ __asm__ __volatile__(
+ "lw %[t1], 0(%[dest]) \n\t"
+ "lw %[t2], 4(%[dest]) \n\t"
+ "addu_s.qb %[vector_1], %[t1], %[vector_a11] \n\t"
+ "addu_s.qb %[vector_2], %[t2], %[vector_a11] \n\t"
+ "addu_s.qb %[vector_1], %[vector_1], %[vector_a11] \n\t"
+ "addu_s.qb %[vector_2], %[vector_2], %[vector_a11] \n\t"
+ "addu_s.qb %[vector_1], %[vector_1], %[vector_a11] \n\t"
+ "addu_s.qb %[vector_2], %[vector_2], %[vector_a11] \n\t"
+ "addu_s.qb %[vector_1], %[vector_1], %[vector_a12] \n\t"
+ "addu_s.qb %[vector_2], %[vector_2], %[vector_a12] \n\t"
+ "sw %[vector_1], 0(%[dest]) \n\t"
+ "sw %[vector_2], 4(%[dest]) \n\t"
+ "add %[dest], %[dest], %[stride] \n\t"
+
+ : [t1] "=&r"(t1), [t2] "=&r"(t2), [vector_1] "=&r"(vector_1),
+ [vector_2] "=&r"(vector_2), [dest] "+r"(dest)
+ : [stride] "r"(stride), [vector_a11] "r"(vector_a11),
+ [vector_a12] "r"(vector_a12));
}
} else {
/* use quad-byte
@@ -555,11 +601,11 @@ void vpx_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest,
"addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
"sw %[vector_1], 0(%[dest]) \n\t"
"sw %[vector_2], 4(%[dest]) \n\t"
- "add %[dest], %[dest], %[dest_stride] \n\t"
+ "add %[dest], %[dest], %[stride] \n\t"
: [t1] "=&r"(t1), [t2] "=&r"(t2), [vector_1] "=&r"(vector_1),
[vector_2] "=&r"(vector_2), [dest] "+r"(dest)
- : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1));
+ : [stride] "r"(stride), [vector_a1] "r"(vector_a1));
}
}
}
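Each of the new a1 > 255 branches (the 32x32 and 4x4 cases earlier, and the 8x8 case above) splits the DC offset before replicating it with replv.qb, presumably because replv.qb can only broadcast an 8-bit value into the byte lanes: a1 is decomposed into n - 1 copies of a11 = a1 >> shift plus a remainder a12, applied with saturating addu_s.qb steps. A scalar sketch of one pixel under that assumption (shift is 1, 2 and 3 for the 32x32, 8x8 and 4x4 paths respectively):

    /* Apply a DC offset larger than 255 as several saturating byte adds. */
    static unsigned char add_dc_saturating(unsigned char pixel, int a1, int shift) {
      const int n = 1 << shift;           /* number of add steps */
      const int a11 = a1 >> shift;
      const int a12 = a1 - a11 * (n - 1); /* remainder applied last */
      int acc = pixel, i;
      for (i = 0; i < n - 1; ++i) {
        acc += a11;
        if (acc > 255) acc = 255;         /* addu_s.qb saturates per byte lane */
      }
      acc += a12;
      return (unsigned char)(acc > 255 ? 255 : acc);
    }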
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/txfm_macros_msa.h b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/txfm_macros_msa.h
index da100f6a980..f077fa4814a 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/txfm_macros_msa.h
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/txfm_macros_msa.h
@@ -15,19 +15,24 @@
#define DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) \
{ \
- v8i16 k0_m = __msa_fill_h(cnst0); \
- v4i32 s0_m, s1_m, s2_m, s3_m; \
+ v4i32 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m; \
+ v8i16 k0_m, k1_m, k2_m, zero = { 0 }; \
\
- s0_m = (v4i32)__msa_fill_h(cnst1); \
- k0_m = __msa_ilvev_h((v8i16)s0_m, k0_m); \
+ k0_m = __msa_fill_h(cnst0); \
+ k1_m = __msa_fill_h(cnst1); \
+ k2_m = __msa_ilvev_h((v8i16)k1_m, k0_m); \
+ k0_m = __msa_ilvev_h((v8i16)zero, k0_m); \
+ k1_m = __msa_ilvev_h(k1_m, (v8i16)zero); \
\
- ILVRL_H2_SW((-reg1), reg0, s1_m, s0_m); \
+ ILVRL_H2_SW(reg1, reg0, s5_m, s4_m); \
ILVRL_H2_SW(reg0, reg1, s3_m, s2_m); \
- DOTP_SH2_SW(s1_m, s0_m, k0_m, k0_m, s1_m, s0_m); \
+ DOTP_SH2_SW(s5_m, s4_m, k0_m, k0_m, s1_m, s0_m); \
+ s1_m = __msa_dpsub_s_w(s1_m, (v8i16)s5_m, k1_m); \
+ s0_m = __msa_dpsub_s_w(s0_m, (v8i16)s4_m, k1_m); \
SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS); \
out0 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m); \
\
- DOTP_SH2_SW(s3_m, s2_m, k0_m, k0_m, s1_m, s0_m); \
+ DOTP_SH2_SW(s3_m, s2_m, k2_m, k2_m, s1_m, s0_m); \
SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS); \
out1 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m); \
}
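Read element-wise, DOTP_CONST_PAIR appears to compute the usual fixed-point pair rotation of the inverse transform; the rewrite reaches the same values with dpsub_s_w instead of interleaving a negated reg1, presumably to avoid the overflow of -reg1 when an element is INT16_MIN. A hedged scalar sketch of the per-element arithmetic (taking DCT_CONST_BITS as 14, its value in vpx_dsp/txfm_common.h):

    #include <stdint.h>

    #define DCT_CONST_BITS 14

    static int16_t round_shift(int32_t v) {
      return (int16_t)((v + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS);
    }

    /* out0 = rnd(reg0*cnst0 - reg1*cnst1), out1 = rnd(reg1*cnst0 + reg0*cnst1) */
    static void dotp_const_pair_scalar(int16_t reg0, int16_t reg1,
                                       int16_t cnst0, int16_t cnst1,
                                       int16_t *out0, int16_t *out1) {
      *out0 = round_shift((int32_t)reg0 * cnst0 - (int32_t)reg1 * cnst1);
      *out1 = round_shift((int32_t)reg1 * cnst0 + (int32_t)reg0 * cnst1);
    }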
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp.mk b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp.mk
index 2909beb0f6c..bb20ea27421 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp.mk
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp.mk
@@ -48,6 +48,7 @@ DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_8t_ssse3.asm
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_SSE) += x86/highbd_intrapred_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_intrapred_sse2.asm
+DSP_SRCS-$(HAVE_NEON) += arm/highbd_intrapred_neon.c
endif # CONFIG_VP9_HIGHBITDEPTH
ifneq ($(filter yes,$(CONFIG_POSTPROC) $(CONFIG_VP9_POSTPROC)),)
@@ -56,6 +57,7 @@ DSP_SRCS-yes += deblock.c
DSP_SRCS-yes += postproc.h
DSP_SRCS-$(HAVE_MSA) += mips/add_noise_msa.c
DSP_SRCS-$(HAVE_MSA) += mips/deblock_msa.c
+DSP_SRCS-$(HAVE_NEON) += arm/deblock_neon.c
DSP_SRCS-$(HAVE_SSE2) += x86/add_noise_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/deblock_sse2.asm
endif # CONFIG_POSTPROC
@@ -140,14 +142,11 @@ DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/loopfilter_sse2.c
DSP_SRCS-$(HAVE_AVX2) += x86/loopfilter_avx2.c
ifeq ($(HAVE_NEON_ASM),yes)
-DSP_SRCS-yes += arm/loopfilter_vertical_4_dual_neon.c
DSP_SRCS-yes += arm/loopfilter_16_neon$(ASM)
DSP_SRCS-yes += arm/loopfilter_8_neon$(ASM)
DSP_SRCS-yes += arm/loopfilter_4_neon$(ASM)
else
-ifeq ($(HAVE_NEON),yes)
-DSP_SRCS-yes += arm/loopfilter_neon.c
-endif # HAVE_NEON
+DSP_SRCS-$(HAVE_NEON) += arm/loopfilter_neon.c
endif # HAVE_NEON_ASM
DSP_SRCS-$(HAVE_MSA) += mips/loopfilter_msa.h
@@ -203,17 +202,6 @@ endif # ARCH_X86_64
DSP_SRCS-$(HAVE_NEON_ASM) += arm/save_reg_neon$(ASM)
ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
-ifeq ($(HAVE_NEON_ASM),yes)
-DSP_SRCS-yes += arm/idct16x16_add_neon$(ASM)
-else
-ifeq ($(HAVE_NEON),yes)
-DSP_SRCS-yes += arm/idct16x16_add_neon.c
-endif # HAVE_NEON
-endif # HAVE_NEON_ASM
-DSP_SRCS-$(HAVE_NEON) += arm/idct16x16_neon.c
-DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_add_neon.c
-DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_34_add_neon.c
-
DSP_SRCS-$(HAVE_MSA) += mips/inv_txfm_msa.h
DSP_SRCS-$(HAVE_MSA) += mips/idct4x4_msa.c
DSP_SRCS-$(HAVE_MSA) += mips/idct8x8_msa.c
@@ -226,6 +214,9 @@ DSP_SRCS-$(HAVE_DSPR2) += mips/itrans8_dspr2.c
DSP_SRCS-$(HAVE_DSPR2) += mips/itrans16_dspr2.c
DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_dspr2.c
DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_cols_dspr2.c
+else # CONFIG_VP9_HIGHBITDEPTH
+DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct4x4_add_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct8x8_add_neon.c
endif # !CONFIG_VP9_HIGHBITDEPTH
ifeq ($(HAVE_NEON_ASM),yes)
@@ -235,15 +226,21 @@ DSP_SRCS-yes += arm/idct4x4_add_neon$(ASM)
DSP_SRCS-yes += arm/idct8x8_1_add_neon$(ASM)
DSP_SRCS-yes += arm/idct8x8_add_neon$(ASM)
DSP_SRCS-yes += arm/idct16x16_1_add_neon$(ASM)
+DSP_SRCS-yes += arm/idct16x16_add_neon$(ASM)
+DSP_SRCS-yes += arm/idct16x16_neon.c
else
DSP_SRCS-$(HAVE_NEON) += arm/idct4x4_1_add_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/idct4x4_add_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/idct8x8_1_add_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/idct8x8_add_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/idct16x16_1_add_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/idct16x16_add_neon.c
endif # HAVE_NEON_ASM
DSP_SRCS-$(HAVE_NEON) += arm/idct_neon.h
DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_1_add_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_34_add_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_135_add_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_add_neon.c
endif # CONFIG_VP9
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl
index ee403be3975..ee1b2927938 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -96,6 +96,7 @@ specialize qw/vpx_h_predictor_8x8 neon dspr2 msa sse2/;
add_proto qw/void vpx_d117_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
add_proto qw/void vpx_d135_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d135_predictor_8x8 neon/;
add_proto qw/void vpx_d153_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vpx_d153_predictor_8x8 ssse3/;
@@ -139,6 +140,7 @@ specialize qw/vpx_h_predictor_16x16 neon dspr2 msa sse2/;
add_proto qw/void vpx_d117_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
add_proto qw/void vpx_d135_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d135_predictor_16x16 neon/;
add_proto qw/void vpx_d153_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vpx_d153_predictor_16x16 ssse3/;
@@ -167,7 +169,7 @@ specialize qw/vpx_d207_predictor_32x32 ssse3/;
add_proto qw/void vpx_d207e_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
add_proto qw/void vpx_d45_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_d45_predictor_32x32 ssse3/;
+specialize qw/vpx_d45_predictor_32x32 neon ssse3/;
add_proto qw/void vpx_d45e_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
@@ -182,6 +184,7 @@ specialize qw/vpx_h_predictor_32x32 neon msa sse2/;
add_proto qw/void vpx_d117_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
add_proto qw/void vpx_d135_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d135_predictor_32x32 neon/;
add_proto qw/void vpx_d153_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vpx_d153_predictor_32x32 ssse3/;
@@ -211,6 +214,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_highbd_d207e_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
add_proto qw/void vpx_highbd_d45_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_d45_predictor_4x4 neon/;
add_proto qw/void vpx_highbd_d45e_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
@@ -219,33 +223,39 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_highbd_d63e_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
add_proto qw/void vpx_highbd_h_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_h_predictor_4x4 neon/;
add_proto qw/void vpx_highbd_d117_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
add_proto qw/void vpx_highbd_d135_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_d135_predictor_4x4 neon/;
add_proto qw/void vpx_highbd_d153_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
add_proto qw/void vpx_highbd_v_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
- specialize qw/vpx_highbd_v_predictor_4x4 sse2/;
+ specialize qw/vpx_highbd_v_predictor_4x4 neon sse2/;
add_proto qw/void vpx_highbd_tm_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
- specialize qw/vpx_highbd_tm_predictor_4x4 sse2/;
+ specialize qw/vpx_highbd_tm_predictor_4x4 neon sse2/;
add_proto qw/void vpx_highbd_dc_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
- specialize qw/vpx_highbd_dc_predictor_4x4 sse2/;
+ specialize qw/vpx_highbd_dc_predictor_4x4 neon sse2/;
add_proto qw/void vpx_highbd_dc_top_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_dc_top_predictor_4x4 neon/;
add_proto qw/void vpx_highbd_dc_left_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_dc_left_predictor_4x4 neon/;
add_proto qw/void vpx_highbd_dc_128_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_dc_128_predictor_4x4 neon/;
add_proto qw/void vpx_highbd_d207_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
add_proto qw/void vpx_highbd_d207e_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
add_proto qw/void vpx_highbd_d45_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_d45_predictor_8x8 neon/;
add_proto qw/void vpx_highbd_d45e_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
@@ -254,33 +264,39 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_highbd_d63e_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
add_proto qw/void vpx_highbd_h_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_h_predictor_8x8 neon/;
add_proto qw/void vpx_highbd_d117_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
add_proto qw/void vpx_highbd_d135_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_d135_predictor_8x8 neon/;
add_proto qw/void vpx_highbd_d153_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
add_proto qw/void vpx_highbd_v_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
- specialize qw/vpx_highbd_v_predictor_8x8 sse2/;
+ specialize qw/vpx_highbd_v_predictor_8x8 neon sse2/;
add_proto qw/void vpx_highbd_tm_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
- specialize qw/vpx_highbd_tm_predictor_8x8 sse2/;
+ specialize qw/vpx_highbd_tm_predictor_8x8 neon sse2/;
add_proto qw/void vpx_highbd_dc_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
- specialize qw/vpx_highbd_dc_predictor_8x8 sse2/;;
+ specialize qw/vpx_highbd_dc_predictor_8x8 neon sse2/;
add_proto qw/void vpx_highbd_dc_top_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_dc_top_predictor_8x8 neon/;
add_proto qw/void vpx_highbd_dc_left_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_dc_left_predictor_8x8 neon/;
add_proto qw/void vpx_highbd_dc_128_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_dc_128_predictor_8x8 neon/;
add_proto qw/void vpx_highbd_d207_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
add_proto qw/void vpx_highbd_d207e_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
add_proto qw/void vpx_highbd_d45_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_d45_predictor_16x16 neon/;
add_proto qw/void vpx_highbd_d45e_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
@@ -289,33 +305,39 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_highbd_d63e_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
add_proto qw/void vpx_highbd_h_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_h_predictor_16x16 neon/;
add_proto qw/void vpx_highbd_d117_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
add_proto qw/void vpx_highbd_d135_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_d135_predictor_16x16 neon/;
add_proto qw/void vpx_highbd_d153_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
add_proto qw/void vpx_highbd_v_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
- specialize qw/vpx_highbd_v_predictor_16x16 sse2/;
+ specialize qw/vpx_highbd_v_predictor_16x16 neon sse2/;
add_proto qw/void vpx_highbd_tm_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
- specialize qw/vpx_highbd_tm_predictor_16x16 sse2/;
+ specialize qw/vpx_highbd_tm_predictor_16x16 neon sse2/;
add_proto qw/void vpx_highbd_dc_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
- specialize qw/vpx_highbd_dc_predictor_16x16 sse2/;
+ specialize qw/vpx_highbd_dc_predictor_16x16 neon sse2/;
add_proto qw/void vpx_highbd_dc_top_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_dc_top_predictor_16x16 neon/;
add_proto qw/void vpx_highbd_dc_left_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_dc_left_predictor_16x16 neon/;
add_proto qw/void vpx_highbd_dc_128_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_dc_128_predictor_16x16 neon/;
add_proto qw/void vpx_highbd_d207_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
add_proto qw/void vpx_highbd_d207e_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
add_proto qw/void vpx_highbd_d45_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_d45_predictor_32x32 neon/;
add_proto qw/void vpx_highbd_d45e_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
@@ -324,27 +346,32 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_highbd_d63e_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
add_proto qw/void vpx_highbd_h_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_h_predictor_32x32 neon/;
add_proto qw/void vpx_highbd_d117_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
add_proto qw/void vpx_highbd_d135_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_d135_predictor_32x32 neon/;
add_proto qw/void vpx_highbd_d153_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
add_proto qw/void vpx_highbd_v_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
- specialize qw/vpx_highbd_v_predictor_32x32 sse2/;
+ specialize qw/vpx_highbd_v_predictor_32x32 neon sse2/;
add_proto qw/void vpx_highbd_tm_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
- specialize qw/vpx_highbd_tm_predictor_32x32 sse2/;
+ specialize qw/vpx_highbd_tm_predictor_32x32 neon sse2/;
add_proto qw/void vpx_highbd_dc_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
- specialize qw/vpx_highbd_dc_predictor_32x32 sse2/;
+ specialize qw/vpx_highbd_dc_predictor_32x32 neon sse2/;
add_proto qw/void vpx_highbd_dc_top_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_dc_top_predictor_32x32 neon/;
add_proto qw/void vpx_highbd_dc_left_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_dc_left_predictor_32x32 neon/;
add_proto qw/void vpx_highbd_dc_128_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_dc_128_predictor_32x32 neon/;
} # CONFIG_VP9_HIGHBITDEPTH
#
@@ -585,193 +612,193 @@ if (vpx_config("CONFIG_VP9") eq "yes") {
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
# Note as optimized versions of these functions are added we need to add a check to ensure
# that when CONFIG_EMULATE_HARDWARE is on, it defaults to the C versions only.
- add_proto qw/void vpx_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- add_proto qw/void vpx_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride";
specialize qw/vpx_iwht4x4_16_add sse2/;
- add_proto qw/void vpx_highbd_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ add_proto qw/void vpx_highbd_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
+ specialize qw/vpx_highbd_idct4x4_1_add neon/;
- add_proto qw/void vpx_highbd_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ add_proto qw/void vpx_highbd_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
+ specialize qw/vpx_highbd_idct8x8_1_add neon/;
- add_proto qw/void vpx_highbd_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ add_proto qw/void vpx_highbd_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
- add_proto qw/void vpx_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ add_proto qw/void vpx_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
- add_proto qw/void vpx_highbd_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ add_proto qw/void vpx_highbd_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
- add_proto qw/void vpx_highbd_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ add_proto qw/void vpx_highbd_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
specialize qw/vpx_highbd_idct32x32_1_add sse2/;
- add_proto qw/void vpx_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ add_proto qw/void vpx_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
- add_proto qw/void vpx_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ add_proto qw/void vpx_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
# Force C versions if CONFIG_EMULATE_HARDWARE is 1
if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
- add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- add_proto qw/void vpx_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ add_proto qw/void vpx_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
- add_proto qw/void vpx_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ add_proto qw/void vpx_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
- add_proto qw/void vpx_highbd_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ add_proto qw/void vpx_highbd_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
- add_proto qw/void vpx_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ add_proto qw/void vpx_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
- add_proto qw/void vpx_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ add_proto qw/void vpx_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
} else {
- add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride";
specialize qw/vpx_idct4x4_16_add neon sse2/;
- add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
specialize qw/vpx_idct4x4_1_add neon sse2/;
- add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride";
specialize qw/vpx_idct8x8_64_add neon sse2/, "$ssse3_x86_64";
- add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int stride";
specialize qw/vpx_idct8x8_12_add neon sse2/, "$ssse3_x86_64";
- add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
specialize qw/vpx_idct8x8_1_add neon sse2/;
- add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/vpx_idct16x16_256_add sse2/;
+ add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int stride";
+ specialize qw/vpx_idct16x16_256_add neon sse2/;
- add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/vpx_idct16x16_10_add sse2/;
+ add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int stride";
+ specialize qw/vpx_idct16x16_10_add neon sse2/;
- add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
specialize qw/vpx_idct16x16_1_add neon sse2/;
- add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/vpx_idct32x32_1024_add sse2/, "$ssse3_x86_64";
+ add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int stride";
+ specialize qw/vpx_idct32x32_1024_add neon sse2/, "$ssse3_x86_64";
- add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/vpx_idct32x32_135_add sse2/, "$ssse3_x86_64";
+ add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int stride";
+ specialize qw/vpx_idct32x32_135_add neon sse2/, "$ssse3_x86_64";
# Need to add 135 eob idct32x32 implementations.
$vpx_idct32x32_135_add_sse2=vpx_idct32x32_1024_add_sse2;
- add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/vpx_idct32x32_34_add sse2/, "$ssse3_x86_64";
+ add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int stride";
+ specialize qw/vpx_idct32x32_34_add neon sse2/, "$ssse3_x86_64";
- add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
specialize qw/vpx_idct32x32_1_add neon sse2/;
- add_proto qw/void vpx_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- specialize qw/vpx_highbd_idct4x4_16_add sse2/;
+ add_proto qw/void vpx_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
+ specialize qw/vpx_highbd_idct4x4_16_add neon sse2/;
- add_proto qw/void vpx_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- specialize qw/vpx_highbd_idct8x8_64_add sse2/;
+ add_proto qw/void vpx_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
+ specialize qw/vpx_highbd_idct8x8_64_add neon sse2/;
- add_proto qw/void vpx_highbd_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- specialize qw/vpx_highbd_idct8x8_12_add sse2/;
+ add_proto qw/void vpx_highbd_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
+ specialize qw/vpx_highbd_idct8x8_12_add neon sse2/;
- add_proto qw/void vpx_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ add_proto qw/void vpx_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
specialize qw/vpx_highbd_idct16x16_256_add sse2/;
- add_proto qw/void vpx_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ add_proto qw/void vpx_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
specialize qw/vpx_highbd_idct16x16_10_add sse2/;
} # CONFIG_EMULATE_HARDWARE
} else {
# Force C versions if CONFIG_EMULATE_HARDWARE is 1
if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
- add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- add_proto qw/void vpx_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- add_proto qw/void vpx_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride";
} else {
- add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
specialize qw/vpx_idct4x4_1_add sse2 neon dspr2 msa/;
- add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride";
specialize qw/vpx_idct4x4_16_add sse2 neon dspr2 msa/;
- add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
specialize qw/vpx_idct8x8_1_add sse2 neon dspr2 msa/;
- add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride";
specialize qw/vpx_idct8x8_64_add sse2 neon dspr2 msa/, "$ssse3_x86_64";
- add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int stride";
specialize qw/vpx_idct8x8_12_add sse2 neon dspr2 msa/, "$ssse3_x86_64";
- add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
specialize qw/vpx_idct16x16_1_add sse2 neon dspr2 msa/;
- add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int stride";
specialize qw/vpx_idct16x16_256_add sse2 neon dspr2 msa/;
- add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int stride";
specialize qw/vpx_idct16x16_10_add sse2 neon dspr2 msa/;
- add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int stride";
specialize qw/vpx_idct32x32_1024_add sse2 neon dspr2 msa/, "$ssse3_x86_64";
- add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int stride";
specialize qw/vpx_idct32x32_135_add sse2 neon dspr2 msa/, "$ssse3_x86_64";
- # Need to add 135 eob idct32x32 implementations.
$vpx_idct32x32_135_add_sse2=vpx_idct32x32_1024_add_sse2;
- $vpx_idct32x32_135_add_neon=vpx_idct32x32_1024_add_neon;
$vpx_idct32x32_135_add_dspr2=vpx_idct32x32_1024_add_dspr2;
$vpx_idct32x32_135_add_msa=vpx_idct32x32_1024_add_msa;
- add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int stride";
specialize qw/vpx_idct32x32_34_add sse2 neon dspr2 msa/, "$ssse3_x86_64";
- add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
specialize qw/vpx_idct32x32_1_add sse2 neon dspr2 msa/;
- add_proto qw/void vpx_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
specialize qw/vpx_iwht4x4_1_add msa/;
- add_proto qw/void vpx_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ add_proto qw/void vpx_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride";
specialize qw/vpx_iwht4x4_16_add msa sse2/;
} # CONFIG_EMULATE_HARDWARE
} # CONFIG_VP9_HIGHBITDEPTH
@@ -1724,15 +1751,13 @@ if (vpx_config("CONFIG_POSTPROC") eq "yes" || vpx_config("CONFIG_VP9_POSTPROC")
specialize qw/vpx_plane_add_noise sse2 msa/;
add_proto qw/void vpx_mbpost_proc_down/, "unsigned char *dst, int pitch, int rows, int cols,int flimit";
- specialize qw/vpx_mbpost_proc_down sse2 msa/;
- $vpx_mbpost_proc_down_sse2=vpx_mbpost_proc_down_xmm;
+ specialize qw/vpx_mbpost_proc_down sse2 neon msa/;
add_proto qw/void vpx_mbpost_proc_across_ip/, "unsigned char *dst, int pitch, int rows, int cols,int flimit";
- specialize qw/vpx_mbpost_proc_across_ip sse2 msa/;
- $vpx_mbpost_proc_across_ip_sse2=vpx_mbpost_proc_across_ip_xmm;
+ specialize qw/vpx_mbpost_proc_across_ip sse2 neon msa/;
add_proto qw/void vpx_post_proc_down_and_across_mb_row/, "unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size";
- specialize qw/vpx_post_proc_down_and_across_mb_row sse2 msa/;
+ specialize qw/vpx_post_proc_down_and_across_mb_row sse2 neon msa/;
}
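
Each specialize line added or extended above only takes effect through the generated per-platform vpx_dsp_rtcd.h headers touched elsewhere in this commit: every entry point defaults to its C implementation and is swapped for an optimized one at setup time when the matching CPU flag is reported. A minimal, self-contained sketch of that dispatch pattern (names and signatures simplified; the real headers are emitted by the rtcd scripts and also cover the sse2/dspr2/msa variants):

/* Hypothetical, simplified model of run-time CPU dispatch; the stubs below
 * stand in for the real C and NEON implementations. */
#include <stdint.h>
#include <stdio.h>

typedef int16_t tran_low_t;   /* int32_t when CONFIG_VP9_HIGHBITDEPTH is on */
#define HAS_NEON 0x01         /* mirrors the flag in vpx_ports/arm.h */

static void idct16x16_256_add_c(const tran_low_t *in, uint8_t *dest, int stride) {
  (void)in; (void)dest; (void)stride;
  puts("C path");
}
static void idct16x16_256_add_neon(const tran_low_t *in, uint8_t *dest, int stride) {
  (void)in; (void)dest; (void)stride;
  puts("NEON path");
}

static void (*idct16x16_256_add)(const tran_low_t *, uint8_t *, int);

static void setup_rtcd(int cpu_flags) {
  idct16x16_256_add = idct16x16_256_add_c;                 /* default to C */
  if (cpu_flags & HAS_NEON) idct16x16_256_add = idct16x16_256_add_neon;
}

int main(void) {
  setup_rtcd(HAS_NEON);        /* pretend CPU detection reported NEON */
  idct16x16_256_add(0, 0, 0);  /* prints "NEON path" */
  return 0;
}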
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/deblock_sse2.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/deblock_sse2.asm
index 6df360df44f..ebca50930a0 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/deblock_sse2.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/deblock_sse2.asm
@@ -230,11 +230,11 @@ sym(vpx_post_proc_down_and_across_mb_row_sse2):
ret
%undef flimit
-;void vpx_mbpost_proc_down_xmm(unsigned char *dst,
-; int pitch, int rows, int cols,int flimit)
+;void vpx_mbpost_proc_down_sse2(unsigned char *dst,
+; int pitch, int rows, int cols,int flimit)
extern sym(vpx_rv)
-global sym(vpx_mbpost_proc_down_xmm) PRIVATE
-sym(vpx_mbpost_proc_down_xmm):
+global sym(vpx_mbpost_proc_down_sse2) PRIVATE
+sym(vpx_mbpost_proc_down_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
@@ -462,10 +462,10 @@ sym(vpx_mbpost_proc_down_xmm):
%undef flimit4
-;void vpx_mbpost_proc_across_ip_xmm(unsigned char *src,
-; int pitch, int rows, int cols,int flimit)
-global sym(vpx_mbpost_proc_across_ip_xmm) PRIVATE
-sym(vpx_mbpost_proc_across_ip_xmm):
+;void vpx_mbpost_proc_across_ip_sse2(unsigned char *src,
+; int pitch, int rows, int cols,int flimit)
+global sym(vpx_mbpost_proc_across_ip_sse2) PRIVATE
+sym(vpx_mbpost_proc_across_ip_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_sse2.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_sse2.c
index d5fc1440c41..487a474a675 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_sse2.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_sse2.c
@@ -402,10 +402,10 @@ void iadst4_sse2(__m128i *in) {
MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, stg2_1, \
stg2_2, stg2_3, stp2_0, stp2_1, stp2_2, stp2_3) \
\
- stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \
- stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \
- stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \
- stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \
+ stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
+ stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
+ stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
+ stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
} \
\
/* Stage3 */ \
@@ -413,10 +413,10 @@ void iadst4_sse2(__m128i *in) {
const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
\
- stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \
- stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \
- stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \
- stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \
+ stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
+ stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
+ stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
+ stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
\
tmp0 = _mm_madd_epi16(lo_56, stg2_1); \
tmp1 = _mm_madd_epi16(hi_56, stg2_1); \
@@ -438,14 +438,14 @@ void iadst4_sse2(__m128i *in) {
} \
\
/* Stage4 */ \
- out0 = _mm_adds_epi16(stp1_0, stp2_7); \
- out1 = _mm_adds_epi16(stp1_1, stp1_6); \
- out2 = _mm_adds_epi16(stp1_2, stp1_5); \
- out3 = _mm_adds_epi16(stp1_3, stp2_4); \
- out4 = _mm_subs_epi16(stp1_3, stp2_4); \
- out5 = _mm_subs_epi16(stp1_2, stp1_5); \
- out6 = _mm_subs_epi16(stp1_1, stp1_6); \
- out7 = _mm_subs_epi16(stp1_0, stp2_7); \
+ out0 = _mm_add_epi16(stp1_0, stp2_7); \
+ out1 = _mm_add_epi16(stp1_1, stp1_6); \
+ out2 = _mm_add_epi16(stp1_2, stp1_5); \
+ out3 = _mm_add_epi16(stp1_3, stp2_4); \
+ out4 = _mm_sub_epi16(stp1_3, stp2_4); \
+ out5 = _mm_sub_epi16(stp1_2, stp1_5); \
+ out6 = _mm_sub_epi16(stp1_1, stp1_6); \
+ out7 = _mm_sub_epi16(stp1_0, stp2_7); \
}
void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest,
@@ -866,8 +866,8 @@ void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest,
stp2_0 = _mm_packs_epi32(tmp0, tmp2);
stp2_2 = _mm_packs_epi32(tmp6, tmp4);
- tmp0 = _mm_adds_epi16(stp1_4, stp1_5);
- tmp1 = _mm_subs_epi16(stp1_4, stp1_5);
+ tmp0 = _mm_add_epi16(stp1_4, stp1_5);
+ tmp1 = _mm_sub_epi16(stp1_4, stp1_5);
stp2_4 = tmp0;
stp2_5 = _mm_unpacklo_epi64(tmp1, zero);
@@ -878,8 +878,8 @@ void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest,
{
const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);
- tmp4 = _mm_adds_epi16(stp2_0, stp2_2);
- tmp6 = _mm_subs_epi16(stp2_0, stp2_2);
+ tmp4 = _mm_add_epi16(stp2_0, stp2_2);
+ tmp6 = _mm_sub_epi16(stp2_0, stp2_2);
stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4);
stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4);
@@ -896,10 +896,10 @@ void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest,
}
// Stage4
- tmp0 = _mm_adds_epi16(stp1_3, stp2_4);
- tmp1 = _mm_adds_epi16(stp1_2, stp1_5);
- tmp2 = _mm_subs_epi16(stp1_3, stp2_4);
- tmp3 = _mm_subs_epi16(stp1_2, stp1_5);
+ tmp0 = _mm_add_epi16(stp1_3, stp2_4);
+ tmp1 = _mm_add_epi16(stp1_2, stp1_5);
+ tmp2 = _mm_sub_epi16(stp1_3, stp2_4);
+ tmp3 = _mm_sub_epi16(stp1_2, stp1_5);
TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3)
@@ -3449,7 +3449,7 @@ static INLINE __m128i clamp_high_sse2(__m128i value, int bd) {
__m128i ubounded, retval;
const __m128i zero = _mm_set1_epi16(0);
const __m128i one = _mm_set1_epi16(1);
- const __m128i max = _mm_subs_epi16(_mm_slli_epi16(one, bd), one);
+ const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
ubounded = _mm_cmpgt_epi16(value, max);
retval = _mm_andnot_si128(ubounded, value);
ubounded = _mm_and_si128(ubounded, max);
@@ -4012,7 +4012,7 @@ void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest8,
__m128i dc_value, d;
const __m128i zero = _mm_setzero_si128();
const __m128i one = _mm_set1_epi16(1);
- const __m128i max = _mm_subs_epi16(_mm_slli_epi16(one, bd), one);
+ const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
int a, i, j;
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
tran_low_t out;
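
The inv_txfm_sse2.c hunks above replace saturating 16-bit SIMD arithmetic (_mm_adds_epi16 / _mm_subs_epi16) with ordinary wrapping arithmetic (_mm_add_epi16 / _mm_sub_epi16), presumably to stay bit-exact with the C reference transforms, which do not saturate. The per-lane difference, shown in scalar code (illustration only):

#include <stdint.h>
#include <stdio.h>

static int16_t add_wrap(int16_t a, int16_t b) {   /* _mm_add_epi16, per lane */
  return (int16_t)((uint16_t)a + (uint16_t)b);    /* wraps modulo 2^16 */
}

static int16_t add_sat(int16_t a, int16_t b) {    /* _mm_adds_epi16, per lane */
  const int32_t s = (int32_t)a + b;
  return (int16_t)(s > INT16_MAX ? INT16_MAX : s < INT16_MIN ? INT16_MIN : s);
}

int main(void) {
  /* 30000 + 10000 does not fit in int16_t: wrap gives -25536, saturate 32767. */
  printf("%d %d\n", add_wrap(30000, 10000), add_sat(30000, 10000));
  return 0;
}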
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm
index 20baf820f6b..dee64e3ad36 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm
@@ -263,7 +263,7 @@ cglobal idct8x8_64_add, 3, 5, 13, input, output, stride
RET
-; inverse 8x8 2D-DCT transform with only first 10 coeffs non-zero
+; inverse 8x8 2D-DCT transform with only first 12 coeffs non-zero
cglobal idct8x8_12_add, 3, 5, 13, input, output, stride
mova m8, [pd_8192]
mova m11, [pw_16]
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_ports/arm_cpudetect.c b/chromium/third_party/libvpx/source/libvpx/vpx_ports/arm_cpudetect.c
index 79c60f7a191..4f9d480ade6 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_ports/arm_cpudetect.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_ports/arm_cpudetect.c
@@ -58,8 +58,12 @@ int arm_cpu_caps(void) {
#elif defined(_MSC_VER) /* end !CONFIG_RUNTIME_CPU_DETECT */
/*For GetExceptionCode() and EXCEPTION_ILLEGAL_INSTRUCTION.*/
+#ifndef WIN32_LEAN_AND_MEAN
#define WIN32_LEAN_AND_MEAN
+#endif
+#ifndef WIN32_EXTRA_LEAN
#define WIN32_EXTRA_LEAN
+#endif
#include <windows.h>
int arm_cpu_caps(void) {
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_ports/vpx_timer.h b/chromium/third_party/libvpx/source/libvpx/vpx_ports/vpx_timer.h
index 4aae30e9474..c1f1b602750 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_ports/vpx_timer.h
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_ports/vpx_timer.h
@@ -21,6 +21,8 @@
/*
* Win32 specific includes
*/
+#undef NOMINMAX
+#define NOMINMAX
#ifndef WIN32_LEAN_AND_MEAN
#define WIN32_LEAN_AND_MEAN
#endif
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_ports/x86.h b/chromium/third_party/libvpx/source/libvpx/vpx_ports/x86.h
index 6ba02cf1fcc..5aabb9e3afa 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_ports/x86.h
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_ports/x86.h
@@ -140,6 +140,11 @@ static INLINE uint64_t xgetbv(void) {
#endif
#if defined(_MSC_VER) && _MSC_VER >= 1700
+#undef NOMINMAX
+#define NOMINMAX
+#ifndef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN
+#endif
#include <windows.h>
#if WINAPI_FAMILY_PARTITION(WINAPI_FAMILY_APP)
#define getenv(x) NULL
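
The three Windows-header hunks above (arm_cpudetect.c, vpx_timer.h, x86.h) apply the same hygiene before including <windows.h>: NOMINMAX is force-defined via #undef/#define so the header cannot inject min/max macros, while WIN32_LEAN_AND_MEAN and WIN32_EXTRA_LEAN are wrapped in #ifndef guards so builds that already define them do not hit redefinition warnings. In isolation the pattern looks like this (sketch only):

/* Header hygiene ahead of <windows.h>; harmless on non-Windows builds. */
#undef NOMINMAX                  /* drop any earlier definition...            */
#define NOMINMAX                 /* ...then define it unconditionally         */
#ifndef WIN32_LEAN_AND_MEAN      /* define only if the build system has not   */
#define WIN32_LEAN_AND_MEAN
#endif
#ifndef WIN32_EXTRA_LEAN
#define WIN32_EXTRA_LEAN
#endif
#if defined(_WIN32)
#include <windows.h>             /* slimmer header, and no min()/max() macros */
#endif

int main(void) { return 0; }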
diff --git a/chromium/third_party/libvpx/source/libvpx/vpxenc.c b/chromium/third_party/libvpx/source/libvpx/vpxenc.c
index a0f760574c8..9cd10ab2eb4 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpxenc.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpxenc.c
@@ -1657,7 +1657,7 @@ static void get_cx_data(struct stream_state *stream,
*got_data = 0;
while ((pkt = vpx_codec_get_cx_data(&stream->encoder, &iter))) {
static size_t fsize = 0;
- static int64_t ivf_header_pos = 0;
+ static FileOffset ivf_header_pos = 0;
switch (pkt->kind) {
case VPX_CODEC_CX_FRAME_PKT:
@@ -1683,7 +1683,7 @@ static void get_cx_data(struct stream_state *stream,
fsize += pkt->data.frame.sz;
if (!(pkt->data.frame.flags & VPX_FRAME_IS_FRAGMENT)) {
- const int64_t currpos = ftello(stream->file);
+ const FileOffset currpos = ftello(stream->file);
fseeko(stream->file, ivf_header_pos, SEEK_SET);
ivf_write_frame_size(stream->file, fsize);
fseeko(stream->file, currpos, SEEK_SET);
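
The vpxenc.c hunk keeps the saved IVF header position and the current position in FileOffset rather than int64_t, so both variables use whatever type the platform's ftello()/fseeko() actually work with (off_t on POSIX builds, a 64-bit integer type on MSVC; the typedef appears to come from libvpx's tools_common.h). A sketch of the save/patch/restore pattern around the frame-size field, under those assumptions:

/* Illustration only; in vpxenc.c the helper that rewrites the size is
 * ivf_write_frame_size() and FileOffset is the libvpx typedef. */
#define _POSIX_C_SOURCE 200809L  /* for ftello/fseeko on POSIX systems */
#include <stdio.h>
#include <sys/types.h>

typedef off_t FileOffset;        /* stand-in for the libvpx typedef */

static void patch_frame_size(FILE *file, FileOffset ivf_header_pos,
                             size_t fsize) {
  const FileOffset currpos = ftello(file);  /* remember the write position */
  fseeko(file, ivf_header_pos, SEEK_SET);   /* go back to the frame header */
  /* ivf_write_frame_size(file, fsize) would be called here. */
  (void)fsize;
  fseeko(file, currpos, SEEK_SET);          /* and resume appending */
}

int main(void) { return 0; }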