author    | Allan Sandfeld Jensen <allan.jensen@qt.io> | 2017-04-05 14:08:31 +0200
committer | Allan Sandfeld Jensen <allan.jensen@qt.io> | 2017-04-11 07:46:53 +0000
commit    | 6a4cabb866f66d4128a97cdc6d9d08ce074f1247 (patch)
tree      | ab00f70a5e89278d6a0d16ff0c42578dc4d84a2d /chromium/third_party/libvpx
parent    | e733310db58160074f574c429d48f8308c0afe17 (diff)
download  | qtwebengine-chromium-6a4cabb866f66d4128a97cdc6d9d08ce074f1247.tar.gz
BASELINE: Update Chromium to 57.0.2987.144
Change-Id: I29db402ff696c71a04c4dbaec822c2e53efe0267
Reviewed-by: Peter Varga <pvarga@inf.u-szeged.hu>
Diffstat (limited to 'chromium/third_party/libvpx')
134 files changed, 8809 insertions, 4338 deletions
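Most of the hunks below touch libvpx's generated run-time CPU detection (RTCD) headers under source/config/: the `_c`/`_neon` prototypes are renamed to take `stride` instead of `dest_stride`, new NEON bindings appear for the d45/d135 predictors and the post-processing routines, and `vpx_idct32x32_135_add` is rebound from `vpx_idct32x32_1024_add_neon` to the newly added `vpx_idct32x32_135_add_neon`. For context, here is a minimal sketch of the dispatch pattern those headers implement in the cpu-detect configurations; `my_idct_c`, `my_idct_neon`, `cpu_has_neon()` and the local `tran_low_t` typedef are illustrative stand-ins, not code from this change.

```c
#include <stdint.h>

typedef int16_t tran_low_t; /* stand-in for libvpx's transform-coefficient type */

/* Plain C fallback and a (hypothetical) NEON-accelerated kernel. */
static void my_idct_c(const tran_low_t *input, uint8_t *dest, int stride) {
  (void)input; (void)dest; (void)stride; /* body elided in this sketch */
}
static void my_idct_neon(const tran_low_t *input, uint8_t *dest, int stride) {
  (void)input; (void)dest; (void)stride;
}

/* The cpu-detect headers declare one function pointer per kernel (RTCD_EXTERN);
   the fixed NEON configs instead #define the symbol straight to the _neon variant. */
static void (*my_idct)(const tran_low_t *input, uint8_t *dest, int stride);

static int cpu_has_neon(void) { return 1; } /* stand-in for the real CPU-flag query */

/* Mirrors what setup_rtcd_internal() does: bind each pointer once at startup. */
static void setup_rtcd_sketch(void) {
  my_idct = my_idct_c;                        /* safe default */
  if (cpu_has_neon()) my_idct = my_idct_neon; /* override when NEON is present */
}

int main(void) {
  setup_rtcd_sketch();
  tran_low_t coeffs[16] = { 0 };
  uint8_t dst[16] = { 0 };
  my_idct(coeffs, dst, 4); /* all later callers go through the bound pointer */
  return 0;
}
```

The `vpx_idct32x32_135_add` hunks in the vpx_dsp_rtcd.h files below fix exactly this binding: the 135-coefficient variant previously fell back to `vpx_idct32x32_1024_add_neon`, and after this update it dispatches to its own `vpx_idct32x32_135_add_neon` kernel.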
diff --git a/chromium/third_party/libvpx/BUILD.gn b/chromium/third_party/libvpx/BUILD.gn
index 0a39291205b..2c79ee0f405 100644
--- a/chromium/third_party/libvpx/BUILD.gn
+++ b/chromium/third_party/libvpx/BUILD.gn
@@ -38,9 +38,9 @@ if (is_nacl) {
   # vpx_config.asm
   if (is_ios && current_cpu == "arm") {
     os_category = current_os
-  } else if (is_posix) { # Should cover linux, mac, and the ios simulator.
+  } else if (is_posix) {  # Should cover linux, mac, and the ios simulator.
     os_category = "linux"
-  } else { # This should only match windows.
+  } else {  # This should only match windows.
     os_category = current_os
   }
   platform_include_dir =
diff --git a/chromium/third_party/libvpx/README.chromium b/chromium/third_party/libvpx/README.chromium
index 390b58ebdc1..7fcfd85f425 100644
--- a/chromium/third_party/libvpx/README.chromium
+++ b/chromium/third_party/libvpx/README.chromium
@@ -5,9 +5,9 @@ License: BSD
 License File: source/libvpx/LICENSE
 Security Critical: yes
 
-Date: Tuesday November 08 2016
+Date: Monday January 09 2017
 Branch: master
-Commit: 5c64c01c7ca3780d30f140e54a30088f780ae66a
+Commit: 5b1a8ca5e846f838062becaec9ed6b5ecef306e5
 
 Description:
 Contains the sources used to compile libvpx binaries used by Google Chrome and
diff --git a/chromium/third_party/libvpx/libvpx_srcs.gni b/chromium/third_party/libvpx/libvpx_srcs.gni
index 37850e29793..664a0fdbb2e 100644
--- a/chromium/third_party/libvpx/libvpx_srcs.gni
+++ b/chromium/third_party/libvpx/libvpx_srcs.gni
@@ -1515,15 +1515,16 @@ libvpx_srcs_arm_neon = [
   "//third_party/libvpx/source/libvpx/vpx/vpx_integer.h",
   "//third_party/libvpx/source/libvpx/vpx_dsp/add_noise.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/avg_neon.c",
+  "//third_party/libvpx/source/libvpx/vpx_dsp/arm/deblock_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/fwd_txfm_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/hadamard_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_neon.c",
+  "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_add_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.h",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/intrapred_neon.c",
-  "//third_party/libvpx/source/libvpx/vpx_dsp/arm/loopfilter_vertical_4_dual_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/sad4d_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/sad_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/subpel_variance_neon.c",
@@ -1986,14 +1987,15 @@ libvpx_srcs_arm_neon_cpu_detect_neon = [
   "//third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_error_neon.c",
   "//third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/avg_neon.c",
+  "//third_party/libvpx/source/libvpx/vpx_dsp/arm/deblock_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/fwd_txfm_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/hadamard_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_neon.c",
+  "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_add_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/intrapred_neon.c",
-  "//third_party/libvpx/source/libvpx/vpx_dsp/arm/loopfilter_vertical_4_dual_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/sad4d_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/sad_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/subpel_variance_neon.c",
@@ -2299,11 +2301,12 @@ libvpx_srcs_arm64 = [
   "//third_party/libvpx/source/libvpx/vpx/vpx_integer.h",
   "//third_party/libvpx/source/libvpx/vpx_dsp/add_noise.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/avg_neon.c",
+  "//third_party/libvpx/source/libvpx/vpx_dsp/arm/deblock_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/fwd_txfm_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/hadamard_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.c",
-  "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_neon.c",
+  "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_add_neon.c",
diff --git a/chromium/third_party/libvpx/source/config/ios/arm-neon/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/ios/arm-neon/vp9_rtcd.h
index 5f0e862cbfa..d07bcaa3def 100644
--- a/chromium/third_party/libvpx/source/config/ios/arm-neon/vp9_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/ios/arm-neon/vp9_rtcd.h
@@ -70,12 +70,12 @@ void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
 void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
 #define vp9_iht16x16_256_add vp9_iht16x16_256_add_c
 
-void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
-void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
+void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
 #define vp9_iht4x4_16_add vp9_iht4x4_16_add_neon
 
-void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
-void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
+void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
 #define vp9_iht8x8_64_add vp9_iht8x8_64_add_neon
 
 void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
diff --git a/chromium/third_party/libvpx/source/config/ios/arm-neon/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/ios/arm-neon/vpx_dsp_rtcd.h
index a5c50f21727..c9d867d00cd 100644
--- a/chromium/third_party/libvpx/source/config/ios/arm-neon/vpx_dsp_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/ios/arm-neon/vpx_dsp_rtcd.h
@@ -75,17 +75,20 @@ void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *a
 #define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c
void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_c +void vpx_d135_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_neon void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_c +void vpx_d135_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_neon void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vpx_d135_predictor_4x4 vpx_d135_predictor_4x4_neon void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_c +void vpx_d135_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_neon void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vpx_d153_predictor_16x16 vpx_d153_predictor_16x16_c @@ -128,7 +131,8 @@ void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_ #define vpx_d45_predictor_16x16 vpx_d45_predictor_16x16_neon void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d45_predictor_32x32 vpx_d45_predictor_32x32_c +void vpx_d45_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45_predictor_32x32 vpx_d45_predictor_32x32_neon void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); @@ -312,52 +316,52 @@ void vpx_hadamard_8x8_neon(const int16_t *src_diff, int src_stride, int16_t *coe void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c -void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_10_add vpx_idct16x16_10_add_neon -void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_1_add vpx_idct16x16_1_add_neon -void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void 
vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_256_add vpx_idct16x16_256_add_neon -void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_neon -void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); -#define vpx_idct32x32_135_add vpx_idct32x32_1024_add_neon +void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_135_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct32x32_135_add vpx_idct32x32_135_add_neon -void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_1_add vpx_idct32x32_1_add_neon -void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_34_add vpx_idct32x32_34_add_neon -void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct4x4_16_add vpx_idct4x4_16_add_neon -void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct4x4_1_add vpx_idct4x4_1_add_neon -void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct8x8_12_add vpx_idct8x8_12_add_neon -void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct8x8_1_add vpx_idct8x8_1_add_neon -void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_64_add_c(const 
tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct8x8_64_add vpx_idct8x8_64_add_neon int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); @@ -368,10 +372,10 @@ void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride, void vpx_int_pro_row_neon(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height); #define vpx_int_pro_row vpx_int_pro_row_neon -void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_c -void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); @@ -423,10 +427,12 @@ void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, #define vpx_lpf_vertical_8_dual vpx_lpf_vertical_8_dual_neon void vpx_mbpost_proc_across_ip_c(unsigned char *dst, int pitch, int rows, int cols,int flimit); -#define vpx_mbpost_proc_across_ip vpx_mbpost_proc_across_ip_c +void vpx_mbpost_proc_across_ip_neon(unsigned char *dst, int pitch, int rows, int cols,int flimit); +#define vpx_mbpost_proc_across_ip vpx_mbpost_proc_across_ip_neon void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,int flimit); -#define vpx_mbpost_proc_down vpx_mbpost_proc_down_c +void vpx_mbpost_proc_down_neon(unsigned char *dst, int pitch, int rows, int cols,int flimit); +#define vpx_mbpost_proc_down vpx_mbpost_proc_down_neon void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); void vpx_minmax_8x8_neon(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); @@ -449,7 +455,8 @@ void vpx_plane_add_noise_c(uint8_t *start, const int8_t *noise, int blackclamp, #define vpx_plane_add_noise vpx_plane_add_noise_c void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size); -#define vpx_post_proc_down_and_across_mb_row vpx_post_proc_down_and_across_mb_row_c +void vpx_post_proc_down_and_across_mb_row_neon(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size); +#define vpx_post_proc_down_and_across_mb_row vpx_post_proc_down_and_across_mb_row_neon void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); #define vpx_quantize_b vpx_quantize_b_c diff --git a/chromium/third_party/libvpx/source/config/ios/arm64/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/ios/arm64/vp9_rtcd.h index 5f0e862cbfa..d07bcaa3def 100644 --- a/chromium/third_party/libvpx/source/config/ios/arm64/vp9_rtcd.h +++ b/chromium/third_party/libvpx/source/config/ios/arm64/vp9_rtcd.h @@ -70,12 +70,12 @@ void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); #define 
vp9_iht16x16_256_add vp9_iht16x16_256_add_c -void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); -void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); #define vp9_iht4x4_16_add vp9_iht4x4_16_add_neon -void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); -void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); #define vp9_iht8x8_64_add vp9_iht8x8_64_add_neon void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); diff --git a/chromium/third_party/libvpx/source/config/ios/arm64/vpx_config.c b/chromium/third_party/libvpx/source/config/ios/arm64/vpx_config.c index 5f93ebfb676..56a5348abd6 100644 --- a/chromium/third_party/libvpx/source/config/ios/arm64/vpx_config.c +++ b/chromium/third_party/libvpx/source/config/ios/arm64/vpx_config.c @@ -6,5 +6,5 @@ /* in the file PATENTS. All contributing project authors may */ /* be found in the AUTHORS file in the root of the source tree. */ #include "vpx/vpx_codec.h" -static const char* const cfg = "--target=arm64-linux-gcc --enable-external-build --enable-postproc --enable-multi-res-encoding --enable-temporal-denoising --enable-vp9-temporal-denoising --enable-vp9-postproc --size-limit=16384x16384 --enable-realtime-only --disable-install-docs"; +static const char* const cfg = "--target=armv8-linux-gcc --enable-external-build --enable-postproc --enable-multi-res-encoding --enable-temporal-denoising --enable-vp9-temporal-denoising --enable-vp9-postproc --size-limit=16384x16384 --enable-realtime-only --disable-install-docs"; const char *vpx_codec_build_config(void) {return cfg;} diff --git a/chromium/third_party/libvpx/source/config/ios/arm64/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/ios/arm64/vpx_dsp_rtcd.h index a5c50f21727..c9d867d00cd 100644 --- a/chromium/third_party/libvpx/source/config/ios/arm64/vpx_dsp_rtcd.h +++ b/chromium/third_party/libvpx/source/config/ios/arm64/vpx_dsp_rtcd.h @@ -75,17 +75,20 @@ void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *a #define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_c +void vpx_d135_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_neon void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_c +void vpx_d135_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_neon 
void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vpx_d135_predictor_4x4 vpx_d135_predictor_4x4_neon void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_c +void vpx_d135_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_neon void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vpx_d153_predictor_16x16 vpx_d153_predictor_16x16_c @@ -128,7 +131,8 @@ void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_ #define vpx_d45_predictor_16x16 vpx_d45_predictor_16x16_neon void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d45_predictor_32x32 vpx_d45_predictor_32x32_c +void vpx_d45_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45_predictor_32x32 vpx_d45_predictor_32x32_neon void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); @@ -312,52 +316,52 @@ void vpx_hadamard_8x8_neon(const int16_t *src_diff, int src_stride, int16_t *coe void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c -void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_10_add vpx_idct16x16_10_add_neon -void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_1_add vpx_idct16x16_1_add_neon -void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_256_add vpx_idct16x16_256_add_neon -void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_neon -void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int 
dest_stride); -#define vpx_idct32x32_135_add vpx_idct32x32_1024_add_neon +void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_135_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct32x32_135_add vpx_idct32x32_135_add_neon -void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_1_add vpx_idct32x32_1_add_neon -void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_34_add vpx_idct32x32_34_add_neon -void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct4x4_16_add vpx_idct4x4_16_add_neon -void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct4x4_1_add vpx_idct4x4_1_add_neon -void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct8x8_12_add vpx_idct8x8_12_add_neon -void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct8x8_1_add vpx_idct8x8_1_add_neon -void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct8x8_64_add vpx_idct8x8_64_add_neon int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); @@ -368,10 +372,10 @@ void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride, void vpx_int_pro_row_neon(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height); #define vpx_int_pro_row vpx_int_pro_row_neon -void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_c -void 
vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); @@ -423,10 +427,12 @@ void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, #define vpx_lpf_vertical_8_dual vpx_lpf_vertical_8_dual_neon void vpx_mbpost_proc_across_ip_c(unsigned char *dst, int pitch, int rows, int cols,int flimit); -#define vpx_mbpost_proc_across_ip vpx_mbpost_proc_across_ip_c +void vpx_mbpost_proc_across_ip_neon(unsigned char *dst, int pitch, int rows, int cols,int flimit); +#define vpx_mbpost_proc_across_ip vpx_mbpost_proc_across_ip_neon void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,int flimit); -#define vpx_mbpost_proc_down vpx_mbpost_proc_down_c +void vpx_mbpost_proc_down_neon(unsigned char *dst, int pitch, int rows, int cols,int flimit); +#define vpx_mbpost_proc_down vpx_mbpost_proc_down_neon void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); void vpx_minmax_8x8_neon(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); @@ -449,7 +455,8 @@ void vpx_plane_add_noise_c(uint8_t *start, const int8_t *noise, int blackclamp, #define vpx_plane_add_noise vpx_plane_add_noise_c void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size); -#define vpx_post_proc_down_and_across_mb_row vpx_post_proc_down_and_across_mb_row_c +void vpx_post_proc_down_and_across_mb_row_neon(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size); +#define vpx_post_proc_down_and_across_mb_row vpx_post_proc_down_and_across_mb_row_neon void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); #define vpx_quantize_b vpx_quantize_b_c diff --git a/chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vp9_rtcd.h index f7ac2dc300a..789724ffb93 100644 --- a/chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vp9_rtcd.h +++ b/chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vp9_rtcd.h @@ -70,13 +70,13 @@ void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); #define vp9_iht16x16_256_add vp9_iht16x16_256_add_c -void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); -void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); -RTCD_EXTERN void (*vp9_iht4x4_16_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +RTCD_EXTERN void (*vp9_iht4x4_16_add)(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); -void 
vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); -void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); -RTCD_EXTERN void (*vp9_iht8x8_64_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +RTCD_EXTERN void (*vp9_iht8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); diff --git a/chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vpx_dsp_rtcd.h index 0028d86c3ed..2712530f99c 100644 --- a/chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vpx_dsp_rtcd.h +++ b/chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vpx_dsp_rtcd.h @@ -75,17 +75,20 @@ void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *a #define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_c +void vpx_d135_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d135_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_c +void vpx_d135_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d135_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); RTCD_EXTERN void (*vpx_d135_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_c +void vpx_d135_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d135_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vpx_d153_predictor_16x16 vpx_d153_predictor_16x16_c @@ 
-128,7 +131,8 @@ void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_ RTCD_EXTERN void (*vpx_d45_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d45_predictor_32x32 vpx_d45_predictor_32x32_c +void vpx_d45_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d45_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); @@ -312,53 +316,53 @@ RTCD_EXTERN void (*vpx_hadamard_8x8)(const int16_t *src_diff, int src_stride, in void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c -void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct16x16_10_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct16x16_10_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct16x16_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct16x16_1_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct32x32_135_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); 
+void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_135_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_135_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct32x32_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_1_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct32x32_34_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_34_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct4x4_16_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct4x4_16_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct4x4_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct4x4_1_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct8x8_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct8x8_1_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_64_add_neon(const 
tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride); int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); int16_t vpx_int_pro_col_neon(const uint8_t *ref, const int width); @@ -368,10 +372,10 @@ void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride, void vpx_int_pro_row_neon(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height); RTCD_EXTERN void (*vpx_int_pro_row)(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height); -void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_c -void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); @@ -423,10 +427,12 @@ void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, RTCD_EXTERN void (*vpx_lpf_vertical_8_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); void vpx_mbpost_proc_across_ip_c(unsigned char *dst, int pitch, int rows, int cols,int flimit); -#define vpx_mbpost_proc_across_ip vpx_mbpost_proc_across_ip_c +void vpx_mbpost_proc_across_ip_neon(unsigned char *dst, int pitch, int rows, int cols,int flimit); +RTCD_EXTERN void (*vpx_mbpost_proc_across_ip)(unsigned char *dst, int pitch, int rows, int cols,int flimit); void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,int flimit); -#define vpx_mbpost_proc_down vpx_mbpost_proc_down_c +void vpx_mbpost_proc_down_neon(unsigned char *dst, int pitch, int rows, int cols,int flimit); +RTCD_EXTERN void (*vpx_mbpost_proc_down)(unsigned char *dst, int pitch, int rows, int cols,int flimit); void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); void vpx_minmax_8x8_neon(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); @@ -449,7 +455,8 @@ void vpx_plane_add_noise_c(uint8_t *start, const int8_t *noise, int blackclamp, #define vpx_plane_add_noise vpx_plane_add_noise_c void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size); -#define vpx_post_proc_down_and_across_mb_row vpx_post_proc_down_and_across_mb_row_c +void vpx_post_proc_down_and_across_mb_row_neon(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size); +RTCD_EXTERN void (*vpx_post_proc_down_and_across_mb_row)(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size); void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t 
*qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); #define vpx_quantize_b vpx_quantize_b_c @@ -861,10 +868,18 @@ static void setup_rtcd_internal(void) if (flags & HAS_NEON) vpx_convolve_avg = vpx_convolve_avg_neon; vpx_convolve_copy = vpx_convolve_copy_c; if (flags & HAS_NEON) vpx_convolve_copy = vpx_convolve_copy_neon; + vpx_d135_predictor_16x16 = vpx_d135_predictor_16x16_c; + if (flags & HAS_NEON) vpx_d135_predictor_16x16 = vpx_d135_predictor_16x16_neon; + vpx_d135_predictor_32x32 = vpx_d135_predictor_32x32_c; + if (flags & HAS_NEON) vpx_d135_predictor_32x32 = vpx_d135_predictor_32x32_neon; vpx_d135_predictor_4x4 = vpx_d135_predictor_4x4_c; if (flags & HAS_NEON) vpx_d135_predictor_4x4 = vpx_d135_predictor_4x4_neon; + vpx_d135_predictor_8x8 = vpx_d135_predictor_8x8_c; + if (flags & HAS_NEON) vpx_d135_predictor_8x8 = vpx_d135_predictor_8x8_neon; vpx_d45_predictor_16x16 = vpx_d45_predictor_16x16_c; if (flags & HAS_NEON) vpx_d45_predictor_16x16 = vpx_d45_predictor_16x16_neon; + vpx_d45_predictor_32x32 = vpx_d45_predictor_32x32_c; + if (flags & HAS_NEON) vpx_d45_predictor_32x32 = vpx_d45_predictor_32x32_neon; vpx_d45_predictor_4x4 = vpx_d45_predictor_4x4_c; if (flags & HAS_NEON) vpx_d45_predictor_4x4 = vpx_d45_predictor_4x4_neon; vpx_d45_predictor_8x8 = vpx_d45_predictor_8x8_c; @@ -932,7 +947,7 @@ static void setup_rtcd_internal(void) vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_c; if (flags & HAS_NEON) vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_neon; vpx_idct32x32_135_add = vpx_idct32x32_135_add_c; - if (flags & HAS_NEON) vpx_idct32x32_135_add = vpx_idct32x32_1024_add_neon; + if (flags & HAS_NEON) vpx_idct32x32_135_add = vpx_idct32x32_135_add_neon; vpx_idct32x32_1_add = vpx_idct32x32_1_add_c; if (flags & HAS_NEON) vpx_idct32x32_1_add = vpx_idct32x32_1_add_neon; vpx_idct32x32_34_add = vpx_idct32x32_34_add_c; @@ -975,10 +990,16 @@ static void setup_rtcd_internal(void) if (flags & HAS_NEON) vpx_lpf_vertical_8 = vpx_lpf_vertical_8_neon; vpx_lpf_vertical_8_dual = vpx_lpf_vertical_8_dual_c; if (flags & HAS_NEON) vpx_lpf_vertical_8_dual = vpx_lpf_vertical_8_dual_neon; + vpx_mbpost_proc_across_ip = vpx_mbpost_proc_across_ip_c; + if (flags & HAS_NEON) vpx_mbpost_proc_across_ip = vpx_mbpost_proc_across_ip_neon; + vpx_mbpost_proc_down = vpx_mbpost_proc_down_c; + if (flags & HAS_NEON) vpx_mbpost_proc_down = vpx_mbpost_proc_down_neon; vpx_minmax_8x8 = vpx_minmax_8x8_c; if (flags & HAS_NEON) vpx_minmax_8x8 = vpx_minmax_8x8_neon; vpx_mse16x16 = vpx_mse16x16_c; if (flags & HAS_NEON) vpx_mse16x16 = vpx_mse16x16_neon; + vpx_post_proc_down_and_across_mb_row = vpx_post_proc_down_and_across_mb_row_c; + if (flags & HAS_NEON) vpx_post_proc_down_and_across_mb_row = vpx_post_proc_down_and_across_mb_row_neon; vpx_sad16x16 = vpx_sad16x16_c; if (flags & HAS_NEON) vpx_sad16x16 = vpx_sad16x16_neon; vpx_sad16x16x4d = vpx_sad16x16x4d_c; diff --git a/chromium/third_party/libvpx/source/config/linux/arm-neon/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/linux/arm-neon/vp9_rtcd.h index 5f0e862cbfa..d07bcaa3def 100644 --- a/chromium/third_party/libvpx/source/config/linux/arm-neon/vp9_rtcd.h +++ b/chromium/third_party/libvpx/source/config/linux/arm-neon/vp9_rtcd.h @@ -70,12 +70,12 @@ void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); #define vp9_iht16x16_256_add vp9_iht16x16_256_add_c -void vp9_iht4x4_16_add_c(const 
tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); -void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); #define vp9_iht4x4_16_add vp9_iht4x4_16_add_neon -void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); -void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); #define vp9_iht8x8_64_add vp9_iht8x8_64_add_neon void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); diff --git a/chromium/third_party/libvpx/source/config/linux/arm-neon/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/linux/arm-neon/vpx_dsp_rtcd.h index a5c50f21727..c9d867d00cd 100644 --- a/chromium/third_party/libvpx/source/config/linux/arm-neon/vpx_dsp_rtcd.h +++ b/chromium/third_party/libvpx/source/config/linux/arm-neon/vpx_dsp_rtcd.h @@ -75,17 +75,20 @@ void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *a #define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_c +void vpx_d135_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_neon void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_c +void vpx_d135_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_neon void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vpx_d135_predictor_4x4 vpx_d135_predictor_4x4_neon void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_c +void vpx_d135_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_neon void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vpx_d153_predictor_16x16 vpx_d153_predictor_16x16_c @@ -128,7 +131,8 @@ void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_ #define vpx_d45_predictor_16x16 vpx_d45_predictor_16x16_neon void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d45_predictor_32x32 vpx_d45_predictor_32x32_c +void vpx_d45_predictor_32x32_neon(uint8_t *dst, ptrdiff_t 
y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45_predictor_32x32 vpx_d45_predictor_32x32_neon void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); @@ -312,52 +316,52 @@ void vpx_hadamard_8x8_neon(const int16_t *src_diff, int src_stride, int16_t *coe void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c -void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_10_add vpx_idct16x16_10_add_neon -void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_1_add vpx_idct16x16_1_add_neon -void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_256_add vpx_idct16x16_256_add_neon -void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_neon -void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); -#define vpx_idct32x32_135_add vpx_idct32x32_1024_add_neon +void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_135_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct32x32_135_add vpx_idct32x32_135_add_neon -void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_1_add vpx_idct32x32_1_add_neon -void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_34_add vpx_idct32x32_34_add_neon -void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void 
vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct4x4_16_add vpx_idct4x4_16_add_neon -void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct4x4_1_add vpx_idct4x4_1_add_neon -void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct8x8_12_add vpx_idct8x8_12_add_neon -void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct8x8_1_add vpx_idct8x8_1_add_neon -void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct8x8_64_add vpx_idct8x8_64_add_neon int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); @@ -368,10 +372,10 @@ void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride, void vpx_int_pro_row_neon(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height); #define vpx_int_pro_row vpx_int_pro_row_neon -void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_c -void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); @@ -423,10 +427,12 @@ void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, #define vpx_lpf_vertical_8_dual vpx_lpf_vertical_8_dual_neon void vpx_mbpost_proc_across_ip_c(unsigned char *dst, int pitch, int rows, int cols,int flimit); -#define vpx_mbpost_proc_across_ip vpx_mbpost_proc_across_ip_c +void vpx_mbpost_proc_across_ip_neon(unsigned char *dst, int pitch, int rows, int cols,int flimit); +#define vpx_mbpost_proc_across_ip vpx_mbpost_proc_across_ip_neon void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,int flimit); -#define vpx_mbpost_proc_down vpx_mbpost_proc_down_c +void vpx_mbpost_proc_down_neon(unsigned char *dst, int pitch, int rows, int cols,int flimit); +#define vpx_mbpost_proc_down vpx_mbpost_proc_down_neon void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, 
int *max); void vpx_minmax_8x8_neon(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); @@ -449,7 +455,8 @@ void vpx_plane_add_noise_c(uint8_t *start, const int8_t *noise, int blackclamp, #define vpx_plane_add_noise vpx_plane_add_noise_c void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size); -#define vpx_post_proc_down_and_across_mb_row vpx_post_proc_down_and_across_mb_row_c +void vpx_post_proc_down_and_across_mb_row_neon(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size); +#define vpx_post_proc_down_and_across_mb_row vpx_post_proc_down_and_across_mb_row_neon void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); #define vpx_quantize_b vpx_quantize_b_c diff --git a/chromium/third_party/libvpx/source/config/linux/arm/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/linux/arm/vp9_rtcd.h index 206f5e5dba4..cd5726c2e6b 100644 --- a/chromium/third_party/libvpx/source/config/linux/arm/vp9_rtcd.h +++ b/chromium/third_party/libvpx/source/config/linux/arm/vp9_rtcd.h @@ -68,10 +68,10 @@ void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); #define vp9_iht16x16_256_add vp9_iht16x16_256_add_c -void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); #define vp9_iht4x4_16_add vp9_iht4x4_16_add_c -void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); #define vp9_iht8x8_64_add vp9_iht8x8_64_add_c void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); diff --git a/chromium/third_party/libvpx/source/config/linux/arm/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/linux/arm/vpx_dsp_rtcd.h index 6aa4b73856e..89b44dc986c 100644 --- a/chromium/third_party/libvpx/source/config/linux/arm/vpx_dsp_rtcd.h +++ b/chromium/third_party/libvpx/source/config/linux/arm/vpx_dsp_rtcd.h @@ -271,40 +271,40 @@ void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, int16_t *coeff) void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c -void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_10_add vpx_idct16x16_10_add_c -void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_1_add vpx_idct16x16_1_add_c -void 
vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_256_add vpx_idct16x16_256_add_c -void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_c -void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_135_add vpx_idct32x32_135_add_c -void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_1_add vpx_idct32x32_1_add_c -void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_34_add vpx_idct32x32_34_add_c -void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct4x4_16_add vpx_idct4x4_16_add_c -void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct4x4_1_add vpx_idct4x4_1_add_c -void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct8x8_12_add vpx_idct8x8_12_add_c -void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct8x8_1_add vpx_idct8x8_1_add_c -void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct8x8_64_add vpx_idct8x8_64_add_c int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); @@ -313,10 +313,10 @@ int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height); #define vpx_int_pro_row vpx_int_pro_row_c -void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_c -void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); diff --git a/chromium/third_party/libvpx/source/config/linux/arm64/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/linux/arm64/vp9_rtcd.h index 5f0e862cbfa..d07bcaa3def 100644 --- a/chromium/third_party/libvpx/source/config/linux/arm64/vp9_rtcd.h +++ b/chromium/third_party/libvpx/source/config/linux/arm64/vp9_rtcd.h @@ -70,12 +70,12 @@ void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); #define vp9_iht16x16_256_add 
vp9_iht16x16_256_add_c -void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); -void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); #define vp9_iht4x4_16_add vp9_iht4x4_16_add_neon -void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); -void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); #define vp9_iht8x8_64_add vp9_iht8x8_64_add_neon void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); diff --git a/chromium/third_party/libvpx/source/config/linux/arm64/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/linux/arm64/vpx_dsp_rtcd.h index a5c50f21727..c9d867d00cd 100644 --- a/chromium/third_party/libvpx/source/config/linux/arm64/vpx_dsp_rtcd.h +++ b/chromium/third_party/libvpx/source/config/linux/arm64/vpx_dsp_rtcd.h @@ -75,17 +75,20 @@ void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *a #define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_c +void vpx_d135_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_neon void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_c +void vpx_d135_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_neon void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vpx_d135_predictor_4x4 vpx_d135_predictor_4x4_neon void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_c +void vpx_d135_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_neon void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vpx_d153_predictor_16x16 vpx_d153_predictor_16x16_c @@ -128,7 +131,8 @@ void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_ #define vpx_d45_predictor_16x16 vpx_d45_predictor_16x16_neon void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d45_predictor_32x32 vpx_d45_predictor_32x32_c +void 
vpx_d45_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45_predictor_32x32 vpx_d45_predictor_32x32_neon void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); @@ -312,52 +316,52 @@ void vpx_hadamard_8x8_neon(const int16_t *src_diff, int src_stride, int16_t *coe void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c -void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_10_add vpx_idct16x16_10_add_neon -void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_1_add vpx_idct16x16_1_add_neon -void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_256_add vpx_idct16x16_256_add_neon -void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_neon -void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); -#define vpx_idct32x32_135_add vpx_idct32x32_1024_add_neon +void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_135_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct32x32_135_add vpx_idct32x32_135_add_neon -void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_1_add vpx_idct32x32_1_add_neon -void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_34_add vpx_idct32x32_34_add_neon -void vpx_idct4x4_16_add_c(const 
tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct4x4_16_add vpx_idct4x4_16_add_neon -void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct4x4_1_add vpx_idct4x4_1_add_neon -void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct8x8_12_add vpx_idct8x8_12_add_neon -void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct8x8_1_add vpx_idct8x8_1_add_neon -void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct8x8_64_add vpx_idct8x8_64_add_neon int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); @@ -368,10 +372,10 @@ void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride, void vpx_int_pro_row_neon(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height); #define vpx_int_pro_row vpx_int_pro_row_neon -void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_c -void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); @@ -423,10 +427,12 @@ void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, #define vpx_lpf_vertical_8_dual vpx_lpf_vertical_8_dual_neon void vpx_mbpost_proc_across_ip_c(unsigned char *dst, int pitch, int rows, int cols,int flimit); -#define vpx_mbpost_proc_across_ip vpx_mbpost_proc_across_ip_c +void vpx_mbpost_proc_across_ip_neon(unsigned char *dst, int pitch, int rows, int cols,int flimit); +#define vpx_mbpost_proc_across_ip vpx_mbpost_proc_across_ip_neon void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,int flimit); -#define vpx_mbpost_proc_down vpx_mbpost_proc_down_c +void vpx_mbpost_proc_down_neon(unsigned char *dst, int pitch, int rows, int cols,int flimit); +#define vpx_mbpost_proc_down vpx_mbpost_proc_down_neon void 
vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); void vpx_minmax_8x8_neon(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); @@ -449,7 +455,8 @@ void vpx_plane_add_noise_c(uint8_t *start, const int8_t *noise, int blackclamp, #define vpx_plane_add_noise vpx_plane_add_noise_c void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size); -#define vpx_post_proc_down_and_across_mb_row vpx_post_proc_down_and_across_mb_row_c +void vpx_post_proc_down_and_across_mb_row_neon(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size); +#define vpx_post_proc_down_and_across_mb_row vpx_post_proc_down_and_across_mb_row_neon void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); #define vpx_quantize_b vpx_quantize_b_c diff --git a/chromium/third_party/libvpx/source/config/linux/generic/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/linux/generic/vp9_rtcd.h index f0824a37a80..8251c1b5a19 100644 --- a/chromium/third_party/libvpx/source/config/linux/generic/vp9_rtcd.h +++ b/chromium/third_party/libvpx/source/config/linux/generic/vp9_rtcd.h @@ -83,10 +83,10 @@ void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd); #define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c -void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd); +void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd); #define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c -void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd); +void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd); #define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c void vp9_highbd_mbpost_proc_across_ip_c(uint16_t *src, int pitch, int rows, int cols, int flimit); @@ -110,10 +110,10 @@ void vp9_highbd_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, ui void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); #define vp9_iht16x16_256_add vp9_iht16x16_256_add_c -void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); #define vp9_iht4x4_16_add vp9_iht4x4_16_add_c -void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); #define vp9_iht8x8_64_add vp9_iht8x8_64_add_c void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); diff --git 
a/chromium/third_party/libvpx/source/config/linux/generic/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/linux/generic/vpx_dsp_rtcd.h index 163cf7611a8..f537568dd91 100644 --- a/chromium/third_party/libvpx/source/config/linux/generic/vpx_dsp_rtcd.h +++ b/chromium/third_party/libvpx/source/config/linux/generic/vpx_dsp_rtcd.h @@ -901,43 +901,43 @@ void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint1 void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); #define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_c -void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct16x16_10_add vpx_highbd_idct16x16_10_add_c -void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_c -void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct16x16_256_add vpx_highbd_idct16x16_256_add_c -void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_c -void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct32x32_1_add vpx_highbd_idct32x32_1_add_c -void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_c -void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct4x4_16_add vpx_highbd_idct4x4_16_add_c -void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_c -void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct8x8_12_add vpx_highbd_idct8x8_12_add_c -void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_c -void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct8x8_64_add vpx_highbd_idct8x8_64_add_c -void vpx_highbd_iwht4x4_16_add_c(const tran_low_t 
*input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c -void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); @@ -1177,40 +1177,40 @@ void vpx_highbd_v_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint1 void vpx_highbd_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); #define vpx_highbd_v_predictor_8x8 vpx_highbd_v_predictor_8x8_c -void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_10_add vpx_idct16x16_10_add_c -void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_1_add vpx_idct16x16_1_add_c -void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_256_add vpx_idct16x16_256_add_c -void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_c -void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_135_add vpx_idct32x32_135_add_c -void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_1_add vpx_idct32x32_1_add_c -void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_34_add vpx_idct32x32_34_add_c -void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct4x4_16_add vpx_idct4x4_16_add_c -void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct4x4_1_add vpx_idct4x4_1_add_c -void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct8x8_12_add vpx_idct8x8_12_add_c -void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct8x8_1_add vpx_idct8x8_1_add_c -void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct8x8_64_add vpx_idct8x8_64_add_c int16_t vpx_int_pro_col_c(const uint8_t *ref, const 
int width); @@ -1219,10 +1219,10 @@ int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height); #define vpx_int_pro_row vpx_int_pro_row_c -void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_c -void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); diff --git a/chromium/third_party/libvpx/source/config/linux/ia32/vp8_rtcd.h b/chromium/third_party/libvpx/source/config/linux/ia32/vp8_rtcd.h index 8b235e876e3..3addf41714b 100644 --- a/chromium/third_party/libvpx/source/config/linux/ia32/vp8_rtcd.h +++ b/chromium/third_party/libvpx/source/config/linux/ia32/vp8_rtcd.h @@ -297,7 +297,7 @@ static void setup_rtcd_internal(void) vp8_mbuverror = vp8_mbuverror_c; if (flags & HAS_SSE2) vp8_mbuverror = vp8_mbuverror_sse2; vp8_refining_search_sad = vp8_refining_search_sad_c; - if (flags & HAS_SSE3) vp8_refining_search_sad = vp8_refining_search_sadx4; + if (flags & HAS_SSE2) vp8_refining_search_sad = vp8_refining_search_sadx4; vp8_regular_quantize_b = vp8_regular_quantize_b_c; if (flags & HAS_SSE2) vp8_regular_quantize_b = vp8_regular_quantize_b_sse2; if (flags & HAS_SSE4_1) vp8_regular_quantize_b = vp8_regular_quantize_b_sse4_1; diff --git a/chromium/third_party/libvpx/source/config/linux/ia32/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/linux/ia32/vp9_rtcd.h index 55c229554e3..28b5da86510 100644 --- a/chromium/third_party/libvpx/source/config/linux/ia32/vp9_rtcd.h +++ b/chromium/third_party/libvpx/source/config/linux/ia32/vp9_rtcd.h @@ -97,10 +97,10 @@ void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd); #define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c -void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd); +void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd); #define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c -void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd); +void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd); #define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c void vp9_highbd_mbpost_proc_across_ip_c(uint16_t *src, int pitch, int rows, int cols, int flimit); @@ -125,13 +125,13 @@ void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); RTCD_EXTERN void (*vp9_iht16x16_256_add)(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); -void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); -void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); -RTCD_EXTERN void (*vp9_iht4x4_16_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +void 
vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +RTCD_EXTERN void (*vp9_iht4x4_16_add)(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); -void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); -void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); -RTCD_EXTERN void (*vp9_iht8x8_64_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +RTCD_EXTERN void (*vp9_iht8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); #define vp9_quantize_fp vp9_quantize_fp_c diff --git a/chromium/third_party/libvpx/source/config/linux/ia32/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/linux/ia32/vpx_dsp_rtcd.h index 5e31286207d..58079fa420e 100644 --- a/chromium/third_party/libvpx/source/config/linux/ia32/vpx_dsp_rtcd.h +++ b/chromium/third_party/libvpx/source/config/linux/ia32/vpx_dsp_rtcd.h @@ -1084,49 +1084,49 @@ void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint1 void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); #define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_c -void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -RTCD_EXTERN void (*vpx_highbd_idct16x16_10_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); +RTCD_EXTERN void (*vpx_highbd_idct16x16_10_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd); -void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_c -void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -RTCD_EXTERN void (*vpx_highbd_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); +RTCD_EXTERN void (*vpx_highbd_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd); -void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void 
vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_c -void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -RTCD_EXTERN void (*vpx_highbd_idct32x32_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); +RTCD_EXTERN void (*vpx_highbd_idct32x32_1_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd); -void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_c -void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -RTCD_EXTERN void (*vpx_highbd_idct4x4_16_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); +RTCD_EXTERN void (*vpx_highbd_idct4x4_16_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd); -void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_c -void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -RTCD_EXTERN void (*vpx_highbd_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); +RTCD_EXTERN void (*vpx_highbd_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd); -void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_c -void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -RTCD_EXTERN void (*vpx_highbd_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); +RTCD_EXTERN void (*vpx_highbd_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd); -void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_iwht4x4_16_add_c(const 
tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c -void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); @@ -1423,53 +1423,53 @@ void vpx_highbd_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint1 void vpx_highbd_v_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); RTCD_EXTERN void (*vpx_highbd_v_predictor_8x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); -void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct16x16_10_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct16x16_10_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct16x16_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct16x16_1_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct32x32_135_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_135_add)(const tran_low_t 
*input, uint8_t *dest, int stride); -void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct32x32_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_1_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct32x32_34_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_34_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct4x4_16_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct4x4_16_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct4x4_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct4x4_1_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct8x8_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct8x8_1_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride); 
+void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride); int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); int16_t vpx_int_pro_col_sse2(const uint8_t *ref, const int width); @@ -1479,11 +1479,11 @@ void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride, void vpx_int_pro_row_sse2(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height); RTCD_EXTERN void (*vpx_int_pro_row)(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height); -void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_iwht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_iwht4x4_16_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_iwht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_iwht4x4_16_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); @@ -1537,11 +1537,11 @@ void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, RTCD_EXTERN void (*vpx_lpf_vertical_8_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); void vpx_mbpost_proc_across_ip_c(unsigned char *dst, int pitch, int rows, int cols,int flimit); -void vpx_mbpost_proc_across_ip_xmm(unsigned char *dst, int pitch, int rows, int cols,int flimit); +void vpx_mbpost_proc_across_ip_sse2(unsigned char *dst, int pitch, int rows, int cols,int flimit); RTCD_EXTERN void (*vpx_mbpost_proc_across_ip)(unsigned char *dst, int pitch, int rows, int cols,int flimit); void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,int flimit); -void vpx_mbpost_proc_down_xmm(unsigned char *dst, int pitch, int rows, int cols,int flimit); +void vpx_mbpost_proc_down_sse2(unsigned char *dst, int pitch, int rows, int cols,int flimit); RTCD_EXTERN void (*vpx_mbpost_proc_down)(unsigned char *dst, int pitch, int rows, int cols,int flimit); void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); @@ -2618,9 +2618,9 @@ static void setup_rtcd_internal(void) vpx_lpf_vertical_8_dual = vpx_lpf_vertical_8_dual_c; if (flags & HAS_SSE2) vpx_lpf_vertical_8_dual = vpx_lpf_vertical_8_dual_sse2; vpx_mbpost_proc_across_ip = vpx_mbpost_proc_across_ip_c; - if (flags & HAS_SSE2) vpx_mbpost_proc_across_ip = vpx_mbpost_proc_across_ip_xmm; + if (flags & HAS_SSE2) vpx_mbpost_proc_across_ip = vpx_mbpost_proc_across_ip_sse2; vpx_mbpost_proc_down = vpx_mbpost_proc_down_c; - if (flags & HAS_SSE2) vpx_mbpost_proc_down = vpx_mbpost_proc_down_xmm; + if (flags & HAS_SSE2) vpx_mbpost_proc_down = vpx_mbpost_proc_down_sse2; vpx_minmax_8x8 = vpx_minmax_8x8_c; if (flags & HAS_SSE2) vpx_minmax_8x8 = vpx_minmax_8x8_sse2; vpx_mse16x16 = vpx_mse16x16_c; diff --git a/chromium/third_party/libvpx/source/config/linux/mips64el/vp9_rtcd.h 
b/chromium/third_party/libvpx/source/config/linux/mips64el/vp9_rtcd.h index 3d80ce20e90..403db512b5d 100644 --- a/chromium/third_party/libvpx/source/config/linux/mips64el/vp9_rtcd.h +++ b/chromium/third_party/libvpx/source/config/linux/mips64el/vp9_rtcd.h @@ -68,10 +68,10 @@ void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); #define vp9_iht16x16_256_add vp9_iht16x16_256_add_c -void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); #define vp9_iht4x4_16_add vp9_iht4x4_16_add_c -void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); #define vp9_iht8x8_64_add vp9_iht8x8_64_add_c void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); diff --git a/chromium/third_party/libvpx/source/config/linux/mips64el/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/linux/mips64el/vpx_dsp_rtcd.h index 3dfc85323ca..8a27f964d1d 100644 --- a/chromium/third_party/libvpx/source/config/linux/mips64el/vpx_dsp_rtcd.h +++ b/chromium/third_party/libvpx/source/config/linux/mips64el/vpx_dsp_rtcd.h @@ -271,40 +271,40 @@ void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, int16_t *coeff) void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c -void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_10_add vpx_idct16x16_10_add_c -void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_1_add vpx_idct16x16_1_add_c -void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_256_add vpx_idct16x16_256_add_c -void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_c -void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_135_add vpx_idct32x32_135_add_c -void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_1_add vpx_idct32x32_1_add_c -void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_34_add vpx_idct32x32_34_add_c -void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int 
dest_stride); +void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct4x4_16_add vpx_idct4x4_16_add_c -void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct4x4_1_add vpx_idct4x4_1_add_c -void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct8x8_12_add vpx_idct8x8_12_add_c -void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct8x8_1_add vpx_idct8x8_1_add_c -void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct8x8_64_add vpx_idct8x8_64_add_c int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); @@ -313,10 +313,10 @@ int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height); #define vpx_int_pro_row vpx_int_pro_row_c -void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_c -void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); diff --git a/chromium/third_party/libvpx/source/config/linux/mipsel/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/linux/mipsel/vp9_rtcd.h index 3d80ce20e90..403db512b5d 100644 --- a/chromium/third_party/libvpx/source/config/linux/mipsel/vp9_rtcd.h +++ b/chromium/third_party/libvpx/source/config/linux/mipsel/vp9_rtcd.h @@ -68,10 +68,10 @@ void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); #define vp9_iht16x16_256_add vp9_iht16x16_256_add_c -void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); #define vp9_iht4x4_16_add vp9_iht4x4_16_add_c -void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); #define vp9_iht8x8_64_add vp9_iht8x8_64_add_c void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); diff --git a/chromium/third_party/libvpx/source/config/linux/mipsel/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/linux/mipsel/vpx_dsp_rtcd.h index 3dfc85323ca..8a27f964d1d 100644 --- a/chromium/third_party/libvpx/source/config/linux/mipsel/vpx_dsp_rtcd.h +++ b/chromium/third_party/libvpx/source/config/linux/mipsel/vpx_dsp_rtcd.h @@ -271,40 
+271,40 @@ void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, int16_t *coeff) void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c -void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_10_add vpx_idct16x16_10_add_c -void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_1_add vpx_idct16x16_1_add_c -void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_256_add vpx_idct16x16_256_add_c -void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_c -void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_135_add vpx_idct32x32_135_add_c -void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_1_add vpx_idct32x32_1_add_c -void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_34_add vpx_idct32x32_34_add_c -void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct4x4_16_add vpx_idct4x4_16_add_c -void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct4x4_1_add vpx_idct4x4_1_add_c -void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct8x8_12_add vpx_idct8x8_12_add_c -void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct8x8_1_add vpx_idct8x8_1_add_c -void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct8x8_64_add vpx_idct8x8_64_add_c int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); @@ -313,10 +313,10 @@ int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height); #define vpx_int_pro_row vpx_int_pro_row_c -void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_c -void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_iwht4x4_1_add 
vpx_iwht4x4_1_add_c void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); diff --git a/chromium/third_party/libvpx/source/config/linux/x64/vp8_rtcd.h b/chromium/third_party/libvpx/source/config/linux/x64/vp8_rtcd.h index c66d7913431..8dcc9eb2b99 100644 --- a/chromium/third_party/libvpx/source/config/linux/x64/vp8_rtcd.h +++ b/chromium/third_party/libvpx/source/config/linux/x64/vp8_rtcd.h @@ -169,7 +169,7 @@ int vp8_mbuverror_sse2(struct macroblock *mb); int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); int vp8_refining_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -RTCD_EXTERN int (*vp8_refining_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +#define vp8_refining_search_sad vp8_refining_search_sadx4 void vp8_regular_quantize_b_c(struct block *, struct blockd *); void vp8_regular_quantize_b_sse2(struct block *, struct blockd *); @@ -240,8 +240,6 @@ static void setup_rtcd_internal(void) vp8_full_search_sad = vp8_full_search_sad_c; if (flags & HAS_SSE3) vp8_full_search_sad = vp8_full_search_sadx3; if (flags & HAS_SSE4_1) vp8_full_search_sad = vp8_full_search_sadx8; - vp8_refining_search_sad = vp8_refining_search_sad_c; - if (flags & HAS_SSE3) vp8_refining_search_sad = vp8_refining_search_sadx4; vp8_regular_quantize_b = vp8_regular_quantize_b_sse2; if (flags & HAS_SSE4_1) vp8_regular_quantize_b = vp8_regular_quantize_b_sse4_1; vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_sse2; diff --git a/chromium/third_party/libvpx/source/config/linux/x64/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/linux/x64/vp9_rtcd.h index f747ed67d01..072f858e67e 100644 --- a/chromium/third_party/libvpx/source/config/linux/x64/vp9_rtcd.h +++ b/chromium/third_party/libvpx/source/config/linux/x64/vp9_rtcd.h @@ -97,10 +97,10 @@ void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd); #define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c -void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd); +void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd); #define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c -void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd); +void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd); #define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c void vp9_highbd_mbpost_proc_across_ip_c(uint16_t *src, int pitch, int rows, int cols, int flimit); @@ -125,12 +125,12 @@ void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); #define vp9_iht16x16_256_add vp9_iht16x16_256_add_sse2 -void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); -void 
vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); #define vp9_iht4x4_16_add vp9_iht4x4_16_add_sse2 -void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); -void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); #define vp9_iht8x8_64_add vp9_iht8x8_64_add_sse2 void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); diff --git a/chromium/third_party/libvpx/source/config/linux/x64/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/linux/x64/vpx_dsp_rtcd.h index 1188bb43b56..bcb567d8ec6 100644 --- a/chromium/third_party/libvpx/source/config/linux/x64/vpx_dsp_rtcd.h +++ b/chromium/third_party/libvpx/source/config/linux/x64/vpx_dsp_rtcd.h @@ -1091,49 +1091,49 @@ void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint1 void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); #define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_c -void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct16x16_10_add vpx_highbd_idct16x16_10_add_sse2 -void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_c -void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct16x16_256_add vpx_highbd_idct16x16_256_add_sse2 -void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_c -void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct32x32_1_add_sse2(const 
tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct32x32_1_add vpx_highbd_idct32x32_1_add_sse2 -void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_c -void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct4x4_16_add vpx_highbd_idct4x4_16_add_sse2 -void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_c -void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct8x8_12_add vpx_highbd_idct8x8_12_add_sse2 -void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_c -void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct8x8_64_add vpx_highbd_idct8x8_64_add_sse2 -void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c -void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); @@ -1430,58 +1430,58 @@ void vpx_highbd_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint1 void vpx_highbd_v_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); #define vpx_highbd_v_predictor_8x8 vpx_highbd_v_predictor_8x8_sse2 -void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, 
int stride); #define vpx_idct16x16_10_add vpx_idct16x16_10_add_sse2 -void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_1_add vpx_idct16x16_1_add_sse2 -void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_256_add vpx_idct16x16_256_add_sse2 -void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1024_add_ssse3(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1024_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct32x32_135_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_135_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_1_add vpx_idct32x32_1_add_sse2 -void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_34_add_ssse3(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct32x32_34_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_34_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_34_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct4x4_16_add_sse2(const 
tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct4x4_16_add vpx_idct4x4_16_add_sse2 -void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct4x4_1_add vpx_idct4x4_1_add_sse2 -void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct8x8_1_add vpx_idct8x8_1_add_sse2 -void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_64_add_ssse3(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_64_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride); int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); int16_t vpx_int_pro_col_sse2(const uint8_t *ref, const int width); @@ -1491,11 +1491,11 @@ void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride, void vpx_int_pro_row_sse2(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height); #define vpx_int_pro_row vpx_int_pro_row_sse2 -void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_iwht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_iwht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_sse2 -void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); @@ -1549,12 +1549,12 @@ 
void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, #define vpx_lpf_vertical_8_dual vpx_lpf_vertical_8_dual_sse2 void vpx_mbpost_proc_across_ip_c(unsigned char *dst, int pitch, int rows, int cols,int flimit); -void vpx_mbpost_proc_across_ip_xmm(unsigned char *dst, int pitch, int rows, int cols,int flimit); -#define vpx_mbpost_proc_across_ip vpx_mbpost_proc_across_ip_xmm +void vpx_mbpost_proc_across_ip_sse2(unsigned char *dst, int pitch, int rows, int cols,int flimit); +#define vpx_mbpost_proc_across_ip vpx_mbpost_proc_across_ip_sse2 void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,int flimit); -void vpx_mbpost_proc_down_xmm(unsigned char *dst, int pitch, int rows, int cols,int flimit); -#define vpx_mbpost_proc_down vpx_mbpost_proc_down_xmm +void vpx_mbpost_proc_down_sse2(unsigned char *dst, int pitch, int rows, int cols,int flimit); +#define vpx_mbpost_proc_down vpx_mbpost_proc_down_sse2 void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); diff --git a/chromium/third_party/libvpx/source/config/mac/ia32/vp8_rtcd.h b/chromium/third_party/libvpx/source/config/mac/ia32/vp8_rtcd.h index 8b235e876e3..3addf41714b 100644 --- a/chromium/third_party/libvpx/source/config/mac/ia32/vp8_rtcd.h +++ b/chromium/third_party/libvpx/source/config/mac/ia32/vp8_rtcd.h @@ -297,7 +297,7 @@ static void setup_rtcd_internal(void) vp8_mbuverror = vp8_mbuverror_c; if (flags & HAS_SSE2) vp8_mbuverror = vp8_mbuverror_sse2; vp8_refining_search_sad = vp8_refining_search_sad_c; - if (flags & HAS_SSE3) vp8_refining_search_sad = vp8_refining_search_sadx4; + if (flags & HAS_SSE2) vp8_refining_search_sad = vp8_refining_search_sadx4; vp8_regular_quantize_b = vp8_regular_quantize_b_c; if (flags & HAS_SSE2) vp8_regular_quantize_b = vp8_regular_quantize_b_sse2; if (flags & HAS_SSE4_1) vp8_regular_quantize_b = vp8_regular_quantize_b_sse4_1; diff --git a/chromium/third_party/libvpx/source/config/mac/ia32/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/mac/ia32/vp9_rtcd.h index 55c229554e3..28b5da86510 100644 --- a/chromium/third_party/libvpx/source/config/mac/ia32/vp9_rtcd.h +++ b/chromium/third_party/libvpx/source/config/mac/ia32/vp9_rtcd.h @@ -97,10 +97,10 @@ void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd); #define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c -void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd); +void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd); #define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c -void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd); +void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd); #define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c void vp9_highbd_mbpost_proc_across_ip_c(uint16_t *src, int pitch, int rows, int cols, int flimit); @@ -125,13 +125,13 @@ void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); RTCD_EXTERN void (*vp9_iht16x16_256_add)(const tran_low_t *input, uint8_t 
*output, int pitch, int tx_type); -void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); -void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); -RTCD_EXTERN void (*vp9_iht4x4_16_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +RTCD_EXTERN void (*vp9_iht4x4_16_add)(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); -void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); -void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); -RTCD_EXTERN void (*vp9_iht8x8_64_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +RTCD_EXTERN void (*vp9_iht8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); #define vp9_quantize_fp vp9_quantize_fp_c diff --git a/chromium/third_party/libvpx/source/config/mac/ia32/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/mac/ia32/vpx_dsp_rtcd.h index 5e31286207d..58079fa420e 100644 --- a/chromium/third_party/libvpx/source/config/mac/ia32/vpx_dsp_rtcd.h +++ b/chromium/third_party/libvpx/source/config/mac/ia32/vpx_dsp_rtcd.h @@ -1084,49 +1084,49 @@ void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint1 void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); #define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_c -void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -RTCD_EXTERN void (*vpx_highbd_idct16x16_10_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); +RTCD_EXTERN void (*vpx_highbd_idct16x16_10_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd); -void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_c -void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -RTCD_EXTERN void (*vpx_highbd_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void 
vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); +RTCD_EXTERN void (*vpx_highbd_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd); -void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_c -void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -RTCD_EXTERN void (*vpx_highbd_idct32x32_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); +RTCD_EXTERN void (*vpx_highbd_idct32x32_1_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd); -void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_c -void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -RTCD_EXTERN void (*vpx_highbd_idct4x4_16_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); +RTCD_EXTERN void (*vpx_highbd_idct4x4_16_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd); -void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_c -void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -RTCD_EXTERN void (*vpx_highbd_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); +RTCD_EXTERN void (*vpx_highbd_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd); -void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_c -void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -RTCD_EXTERN void (*vpx_highbd_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void 
vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); +RTCD_EXTERN void (*vpx_highbd_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd); -void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c -void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); @@ -1423,53 +1423,53 @@ void vpx_highbd_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint1 void vpx_highbd_v_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); RTCD_EXTERN void (*vpx_highbd_v_predictor_8x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); -void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct16x16_10_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct16x16_10_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct16x16_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct16x16_1_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int 
dest_stride); -RTCD_EXTERN void (*vpx_idct32x32_135_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_135_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct32x32_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_1_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct32x32_34_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_34_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct4x4_16_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct4x4_16_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct4x4_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct4x4_1_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct8x8_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct8x8_1_add)(const tran_low_t *input, uint8_t *dest, int stride); -void 
vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride); int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); int16_t vpx_int_pro_col_sse2(const uint8_t *ref, const int width); @@ -1479,11 +1479,11 @@ void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride, void vpx_int_pro_row_sse2(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height); RTCD_EXTERN void (*vpx_int_pro_row)(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height); -void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_iwht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_iwht4x4_16_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_iwht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_iwht4x4_16_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); @@ -1537,11 +1537,11 @@ void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, RTCD_EXTERN void (*vpx_lpf_vertical_8_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); void vpx_mbpost_proc_across_ip_c(unsigned char *dst, int pitch, int rows, int cols,int flimit); -void vpx_mbpost_proc_across_ip_xmm(unsigned char *dst, int pitch, int rows, int cols,int flimit); +void vpx_mbpost_proc_across_ip_sse2(unsigned char *dst, int pitch, int rows, int cols,int flimit); RTCD_EXTERN void (*vpx_mbpost_proc_across_ip)(unsigned char *dst, int pitch, int rows, int cols,int flimit); void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,int flimit); -void vpx_mbpost_proc_down_xmm(unsigned char *dst, int pitch, int rows, int cols,int flimit); +void vpx_mbpost_proc_down_sse2(unsigned char *dst, int pitch, int rows, int cols,int flimit); RTCD_EXTERN void (*vpx_mbpost_proc_down)(unsigned char *dst, int pitch, int rows, int cols,int flimit); void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); @@ -2618,9 +2618,9 @@ static void setup_rtcd_internal(void) vpx_lpf_vertical_8_dual = vpx_lpf_vertical_8_dual_c; if (flags & HAS_SSE2) vpx_lpf_vertical_8_dual = vpx_lpf_vertical_8_dual_sse2; vpx_mbpost_proc_across_ip = vpx_mbpost_proc_across_ip_c; - if (flags & HAS_SSE2) vpx_mbpost_proc_across_ip = vpx_mbpost_proc_across_ip_xmm; + if (flags & HAS_SSE2) vpx_mbpost_proc_across_ip = vpx_mbpost_proc_across_ip_sse2; vpx_mbpost_proc_down = vpx_mbpost_proc_down_c; - if (flags & HAS_SSE2) vpx_mbpost_proc_down = 
vpx_mbpost_proc_down_xmm; + if (flags & HAS_SSE2) vpx_mbpost_proc_down = vpx_mbpost_proc_down_sse2; vpx_minmax_8x8 = vpx_minmax_8x8_c; if (flags & HAS_SSE2) vpx_minmax_8x8 = vpx_minmax_8x8_sse2; vpx_mse16x16 = vpx_mse16x16_c; diff --git a/chromium/third_party/libvpx/source/config/mac/x64/vp8_rtcd.h b/chromium/third_party/libvpx/source/config/mac/x64/vp8_rtcd.h index c66d7913431..8dcc9eb2b99 100644 --- a/chromium/third_party/libvpx/source/config/mac/x64/vp8_rtcd.h +++ b/chromium/third_party/libvpx/source/config/mac/x64/vp8_rtcd.h @@ -169,7 +169,7 @@ int vp8_mbuverror_sse2(struct macroblock *mb); int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); int vp8_refining_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -RTCD_EXTERN int (*vp8_refining_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +#define vp8_refining_search_sad vp8_refining_search_sadx4 void vp8_regular_quantize_b_c(struct block *, struct blockd *); void vp8_regular_quantize_b_sse2(struct block *, struct blockd *); @@ -240,8 +240,6 @@ static void setup_rtcd_internal(void) vp8_full_search_sad = vp8_full_search_sad_c; if (flags & HAS_SSE3) vp8_full_search_sad = vp8_full_search_sadx3; if (flags & HAS_SSE4_1) vp8_full_search_sad = vp8_full_search_sadx8; - vp8_refining_search_sad = vp8_refining_search_sad_c; - if (flags & HAS_SSE3) vp8_refining_search_sad = vp8_refining_search_sadx4; vp8_regular_quantize_b = vp8_regular_quantize_b_sse2; if (flags & HAS_SSE4_1) vp8_regular_quantize_b = vp8_regular_quantize_b_sse4_1; vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_sse2; diff --git a/chromium/third_party/libvpx/source/config/mac/x64/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/mac/x64/vp9_rtcd.h index f747ed67d01..072f858e67e 100644 --- a/chromium/third_party/libvpx/source/config/mac/x64/vp9_rtcd.h +++ b/chromium/third_party/libvpx/source/config/mac/x64/vp9_rtcd.h @@ -97,10 +97,10 @@ void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd); #define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c -void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd); +void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd); #define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c -void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd); +void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd); #define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c void vp9_highbd_mbpost_proc_across_ip_c(uint16_t *src, int pitch, int rows, int cols, int flimit); @@ -125,12 +125,12 @@ void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); #define vp9_iht16x16_256_add vp9_iht16x16_256_add_sse2 -void vp9_iht4x4_16_add_c(const tran_low_t 
*input, uint8_t *dest, int dest_stride, int tx_type); -void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); #define vp9_iht4x4_16_add vp9_iht4x4_16_add_sse2 -void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); -void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); #define vp9_iht8x8_64_add vp9_iht8x8_64_add_sse2 void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); diff --git a/chromium/third_party/libvpx/source/config/mac/x64/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/mac/x64/vpx_dsp_rtcd.h index 1188bb43b56..bcb567d8ec6 100644 --- a/chromium/third_party/libvpx/source/config/mac/x64/vpx_dsp_rtcd.h +++ b/chromium/third_party/libvpx/source/config/mac/x64/vpx_dsp_rtcd.h @@ -1091,49 +1091,49 @@ void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint1 void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); #define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_c -void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct16x16_10_add vpx_highbd_idct16x16_10_add_sse2 -void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_c -void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct16x16_256_add vpx_highbd_idct16x16_256_add_sse2 -void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_c -void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, 
int bd); +void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct32x32_1_add vpx_highbd_idct32x32_1_add_sse2 -void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_c -void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct4x4_16_add vpx_highbd_idct4x4_16_add_sse2 -void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_c -void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct8x8_12_add vpx_highbd_idct8x8_12_add_sse2 -void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_c -void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct8x8_64_add vpx_highbd_idct8x8_64_add_sse2 -void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c -void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); @@ -1430,58 +1430,58 @@ void vpx_highbd_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint1 void vpx_highbd_v_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); #define vpx_highbd_v_predictor_8x8 vpx_highbd_v_predictor_8x8_sse2 -void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void 
vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_10_add vpx_idct16x16_10_add_sse2 -void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_1_add vpx_idct16x16_1_add_sse2 -void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_256_add vpx_idct16x16_256_add_sse2 -void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1024_add_ssse3(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1024_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct32x32_135_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_135_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_1_add vpx_idct32x32_1_add_sse2 -void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_34_add_ssse3(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct32x32_34_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_34_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_34_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct4x4_16_add_c(const tran_low_t *input, 
uint8_t *dest, int dest_stride); -void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct4x4_16_add vpx_idct4x4_16_add_sse2 -void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct4x4_1_add vpx_idct4x4_1_add_sse2 -void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct8x8_1_add vpx_idct8x8_1_add_sse2 -void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_64_add_ssse3(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_64_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride); int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); int16_t vpx_int_pro_col_sse2(const uint8_t *ref, const int width); @@ -1491,11 +1491,11 @@ void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride, void vpx_int_pro_row_sse2(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height); #define vpx_int_pro_row vpx_int_pro_row_sse2 -void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_iwht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_iwht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_sse2 -void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, 
const uint8_t *limit, const uint8_t *thresh); @@ -1549,12 +1549,12 @@ void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, #define vpx_lpf_vertical_8_dual vpx_lpf_vertical_8_dual_sse2 void vpx_mbpost_proc_across_ip_c(unsigned char *dst, int pitch, int rows, int cols,int flimit); -void vpx_mbpost_proc_across_ip_xmm(unsigned char *dst, int pitch, int rows, int cols,int flimit); -#define vpx_mbpost_proc_across_ip vpx_mbpost_proc_across_ip_xmm +void vpx_mbpost_proc_across_ip_sse2(unsigned char *dst, int pitch, int rows, int cols,int flimit); +#define vpx_mbpost_proc_across_ip vpx_mbpost_proc_across_ip_sse2 void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,int flimit); -void vpx_mbpost_proc_down_xmm(unsigned char *dst, int pitch, int rows, int cols,int flimit); -#define vpx_mbpost_proc_down vpx_mbpost_proc_down_xmm +void vpx_mbpost_proc_down_sse2(unsigned char *dst, int pitch, int rows, int cols,int flimit); +#define vpx_mbpost_proc_down vpx_mbpost_proc_down_sse2 void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); diff --git a/chromium/third_party/libvpx/source/config/nacl/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/nacl/vp9_rtcd.h index f0824a37a80..8251c1b5a19 100644 --- a/chromium/third_party/libvpx/source/config/nacl/vp9_rtcd.h +++ b/chromium/third_party/libvpx/source/config/nacl/vp9_rtcd.h @@ -83,10 +83,10 @@ void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd); #define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c -void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd); +void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd); #define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c -void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd); +void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd); #define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c void vp9_highbd_mbpost_proc_across_ip_c(uint16_t *src, int pitch, int rows, int cols, int flimit); @@ -110,10 +110,10 @@ void vp9_highbd_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, ui void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); #define vp9_iht16x16_256_add vp9_iht16x16_256_add_c -void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); #define vp9_iht4x4_16_add vp9_iht4x4_16_add_c -void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); #define vp9_iht8x8_64_add vp9_iht8x8_64_add_c void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); diff --git 
a/chromium/third_party/libvpx/source/config/nacl/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/nacl/vpx_dsp_rtcd.h index 163cf7611a8..f537568dd91 100644 --- a/chromium/third_party/libvpx/source/config/nacl/vpx_dsp_rtcd.h +++ b/chromium/third_party/libvpx/source/config/nacl/vpx_dsp_rtcd.h @@ -901,43 +901,43 @@ void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint1 void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); #define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_c -void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct16x16_10_add vpx_highbd_idct16x16_10_add_c -void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_c -void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct16x16_256_add vpx_highbd_idct16x16_256_add_c -void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_c -void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct32x32_1_add vpx_highbd_idct32x32_1_add_c -void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_c -void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct4x4_16_add vpx_highbd_idct4x4_16_add_c -void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_c -void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct8x8_12_add vpx_highbd_idct8x8_12_add_c -void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_c -void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct8x8_64_add vpx_highbd_idct8x8_64_add_c -void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, 
int bd); +void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c -void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); @@ -1177,40 +1177,40 @@ void vpx_highbd_v_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint1 void vpx_highbd_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); #define vpx_highbd_v_predictor_8x8 vpx_highbd_v_predictor_8x8_c -void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_10_add vpx_idct16x16_10_add_c -void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_1_add vpx_idct16x16_1_add_c -void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_256_add vpx_idct16x16_256_add_c -void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_c -void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_135_add vpx_idct32x32_135_add_c -void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_1_add vpx_idct32x32_1_add_c -void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_34_add vpx_idct32x32_34_add_c -void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct4x4_16_add vpx_idct4x4_16_add_c -void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct4x4_1_add vpx_idct4x4_1_add_c -void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct8x8_12_add vpx_idct8x8_12_add_c -void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct8x8_1_add vpx_idct8x8_1_add_c -void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct8x8_64_add vpx_idct8x8_64_add_c int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); @@ -1219,10 +1219,10 @@ 
int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height); #define vpx_int_pro_row vpx_int_pro_row_c -void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_c -void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); diff --git a/chromium/third_party/libvpx/source/config/vpx_version.h b/chromium/third_party/libvpx/source/config/vpx_version.h index 97666fffaf6..07f046ed1bc 100644 --- a/chromium/third_party/libvpx/source/config/vpx_version.h +++ b/chromium/third_party/libvpx/source/config/vpx_version.h @@ -1,7 +1,7 @@ #define VERSION_MAJOR 1 #define VERSION_MINOR 6 #define VERSION_PATCH 0 -#define VERSION_EXTRA "702-g5c64c01" +#define VERSION_EXTRA "903-g5b1a8ca5e" #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH)) -#define VERSION_STRING_NOSP "v1.6.0-702-g5c64c01" -#define VERSION_STRING " v1.6.0-702-g5c64c01" +#define VERSION_STRING_NOSP "v1.6.0-903-g5b1a8ca5e" +#define VERSION_STRING " v1.6.0-903-g5b1a8ca5e" diff --git a/chromium/third_party/libvpx/source/config/win/ia32/vp8_rtcd.h b/chromium/third_party/libvpx/source/config/win/ia32/vp8_rtcd.h index 8b235e876e3..3addf41714b 100644 --- a/chromium/third_party/libvpx/source/config/win/ia32/vp8_rtcd.h +++ b/chromium/third_party/libvpx/source/config/win/ia32/vp8_rtcd.h @@ -297,7 +297,7 @@ static void setup_rtcd_internal(void) vp8_mbuverror = vp8_mbuverror_c; if (flags & HAS_SSE2) vp8_mbuverror = vp8_mbuverror_sse2; vp8_refining_search_sad = vp8_refining_search_sad_c; - if (flags & HAS_SSE3) vp8_refining_search_sad = vp8_refining_search_sadx4; + if (flags & HAS_SSE2) vp8_refining_search_sad = vp8_refining_search_sadx4; vp8_regular_quantize_b = vp8_regular_quantize_b_c; if (flags & HAS_SSE2) vp8_regular_quantize_b = vp8_regular_quantize_b_sse2; if (flags & HAS_SSE4_1) vp8_regular_quantize_b = vp8_regular_quantize_b_sse4_1; diff --git a/chromium/third_party/libvpx/source/config/win/ia32/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/win/ia32/vp9_rtcd.h index 55c229554e3..28b5da86510 100644 --- a/chromium/third_party/libvpx/source/config/win/ia32/vp9_rtcd.h +++ b/chromium/third_party/libvpx/source/config/win/ia32/vp9_rtcd.h @@ -97,10 +97,10 @@ void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd); #define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c -void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd); +void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd); #define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c -void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd); +void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd); #define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c void 
vp9_highbd_mbpost_proc_across_ip_c(uint16_t *src, int pitch, int rows, int cols, int flimit); @@ -125,13 +125,13 @@ void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); RTCD_EXTERN void (*vp9_iht16x16_256_add)(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); -void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); -void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); -RTCD_EXTERN void (*vp9_iht4x4_16_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +RTCD_EXTERN void (*vp9_iht4x4_16_add)(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); -void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); -void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); -RTCD_EXTERN void (*vp9_iht8x8_64_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +RTCD_EXTERN void (*vp9_iht8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); #define vp9_quantize_fp vp9_quantize_fp_c diff --git a/chromium/third_party/libvpx/source/config/win/ia32/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/win/ia32/vpx_dsp_rtcd.h index 5e31286207d..58079fa420e 100644 --- a/chromium/third_party/libvpx/source/config/win/ia32/vpx_dsp_rtcd.h +++ b/chromium/third_party/libvpx/source/config/win/ia32/vpx_dsp_rtcd.h @@ -1084,49 +1084,49 @@ void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint1 void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); #define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_c -void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -RTCD_EXTERN void (*vpx_highbd_idct16x16_10_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); +RTCD_EXTERN void (*vpx_highbd_idct16x16_10_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd); -void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_c -void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, 
uint8_t *dest, int dest_stride, int bd); -void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -RTCD_EXTERN void (*vpx_highbd_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); +RTCD_EXTERN void (*vpx_highbd_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd); -void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_c -void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -RTCD_EXTERN void (*vpx_highbd_idct32x32_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); +RTCD_EXTERN void (*vpx_highbd_idct32x32_1_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd); -void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_c -void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -RTCD_EXTERN void (*vpx_highbd_idct4x4_16_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); +RTCD_EXTERN void (*vpx_highbd_idct4x4_16_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd); -void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_c -void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -RTCD_EXTERN void (*vpx_highbd_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); +RTCD_EXTERN void (*vpx_highbd_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd); -void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_c -void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int 
dest_stride, int bd); -void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -RTCD_EXTERN void (*vpx_highbd_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); +RTCD_EXTERN void (*vpx_highbd_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd); -void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c -void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); @@ -1423,53 +1423,53 @@ void vpx_highbd_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint1 void vpx_highbd_v_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); RTCD_EXTERN void (*vpx_highbd_v_predictor_8x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); -void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct16x16_10_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct16x16_10_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct16x16_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct16x16_1_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void 
vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct32x32_135_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_135_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct32x32_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_1_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct32x32_34_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_34_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct4x4_16_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct4x4_16_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct4x4_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct4x4_1_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); 
-RTCD_EXTERN void (*vpx_idct8x8_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct8x8_1_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride); int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); int16_t vpx_int_pro_col_sse2(const uint8_t *ref, const int width); @@ -1479,11 +1479,11 @@ void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride, void vpx_int_pro_row_sse2(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height); RTCD_EXTERN void (*vpx_int_pro_row)(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height); -void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_iwht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_iwht4x4_16_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_iwht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_iwht4x4_16_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); @@ -1537,11 +1537,11 @@ void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, RTCD_EXTERN void (*vpx_lpf_vertical_8_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); void vpx_mbpost_proc_across_ip_c(unsigned char *dst, int pitch, int rows, int cols,int flimit); -void vpx_mbpost_proc_across_ip_xmm(unsigned char *dst, int pitch, int rows, int cols,int flimit); +void vpx_mbpost_proc_across_ip_sse2(unsigned char *dst, int pitch, int rows, int cols,int flimit); RTCD_EXTERN void (*vpx_mbpost_proc_across_ip)(unsigned char *dst, int pitch, int rows, int cols,int flimit); void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,int flimit); -void vpx_mbpost_proc_down_xmm(unsigned char *dst, int pitch, int rows, int cols,int flimit); +void vpx_mbpost_proc_down_sse2(unsigned char *dst, int pitch, int rows, int cols,int flimit); RTCD_EXTERN void (*vpx_mbpost_proc_down)(unsigned char *dst, int pitch, int rows, int cols,int flimit); void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); @@ -2618,9 +2618,9 @@ static void setup_rtcd_internal(void) vpx_lpf_vertical_8_dual = vpx_lpf_vertical_8_dual_c; if (flags & HAS_SSE2) vpx_lpf_vertical_8_dual = 
vpx_lpf_vertical_8_dual_sse2; vpx_mbpost_proc_across_ip = vpx_mbpost_proc_across_ip_c; - if (flags & HAS_SSE2) vpx_mbpost_proc_across_ip = vpx_mbpost_proc_across_ip_xmm; + if (flags & HAS_SSE2) vpx_mbpost_proc_across_ip = vpx_mbpost_proc_across_ip_sse2; vpx_mbpost_proc_down = vpx_mbpost_proc_down_c; - if (flags & HAS_SSE2) vpx_mbpost_proc_down = vpx_mbpost_proc_down_xmm; + if (flags & HAS_SSE2) vpx_mbpost_proc_down = vpx_mbpost_proc_down_sse2; vpx_minmax_8x8 = vpx_minmax_8x8_c; if (flags & HAS_SSE2) vpx_minmax_8x8 = vpx_minmax_8x8_sse2; vpx_mse16x16 = vpx_mse16x16_c; diff --git a/chromium/third_party/libvpx/source/config/win/x64/vp8_rtcd.h b/chromium/third_party/libvpx/source/config/win/x64/vp8_rtcd.h index c66d7913431..8dcc9eb2b99 100644 --- a/chromium/third_party/libvpx/source/config/win/x64/vp8_rtcd.h +++ b/chromium/third_party/libvpx/source/config/win/x64/vp8_rtcd.h @@ -169,7 +169,7 @@ int vp8_mbuverror_sse2(struct macroblock *mb); int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); int vp8_refining_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -RTCD_EXTERN int (*vp8_refining_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +#define vp8_refining_search_sad vp8_refining_search_sadx4 void vp8_regular_quantize_b_c(struct block *, struct blockd *); void vp8_regular_quantize_b_sse2(struct block *, struct blockd *); @@ -240,8 +240,6 @@ static void setup_rtcd_internal(void) vp8_full_search_sad = vp8_full_search_sad_c; if (flags & HAS_SSE3) vp8_full_search_sad = vp8_full_search_sadx3; if (flags & HAS_SSE4_1) vp8_full_search_sad = vp8_full_search_sadx8; - vp8_refining_search_sad = vp8_refining_search_sad_c; - if (flags & HAS_SSE3) vp8_refining_search_sad = vp8_refining_search_sadx4; vp8_regular_quantize_b = vp8_regular_quantize_b_sse2; if (flags & HAS_SSE4_1) vp8_regular_quantize_b = vp8_regular_quantize_b_sse4_1; vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_sse2; diff --git a/chromium/third_party/libvpx/source/config/win/x64/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/win/x64/vp9_rtcd.h index f747ed67d01..072f858e67e 100644 --- a/chromium/third_party/libvpx/source/config/win/x64/vp9_rtcd.h +++ b/chromium/third_party/libvpx/source/config/win/x64/vp9_rtcd.h @@ -97,10 +97,10 @@ void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd); #define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c -void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd); +void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd); #define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c -void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd); +void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd); #define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c void vp9_highbd_mbpost_proc_across_ip_c(uint16_t 
*src, int pitch, int rows, int cols, int flimit); @@ -125,12 +125,12 @@ void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); #define vp9_iht16x16_256_add vp9_iht16x16_256_add_sse2 -void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); -void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); #define vp9_iht4x4_16_add vp9_iht4x4_16_add_sse2 -void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); -void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); #define vp9_iht8x8_64_add vp9_iht8x8_64_add_sse2 void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); diff --git a/chromium/third_party/libvpx/source/config/win/x64/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/win/x64/vpx_dsp_rtcd.h index 1188bb43b56..bcb567d8ec6 100644 --- a/chromium/third_party/libvpx/source/config/win/x64/vpx_dsp_rtcd.h +++ b/chromium/third_party/libvpx/source/config/win/x64/vpx_dsp_rtcd.h @@ -1091,49 +1091,49 @@ void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint1 void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); #define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_c -void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct16x16_10_add vpx_highbd_idct16x16_10_add_sse2 -void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_c -void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct16x16_256_add vpx_highbd_idct16x16_256_add_sse2 -void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define 
vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_c -void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct32x32_1_add vpx_highbd_idct32x32_1_add_sse2 -void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_c -void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct4x4_16_add vpx_highbd_idct4x4_16_add_sse2 -void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_c -void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct8x8_12_add vpx_highbd_idct8x8_12_add_sse2 -void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_c -void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct8x8_64_add vpx_highbd_idct8x8_64_add_sse2 -void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c -void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); @@ -1430,58 +1430,58 @@ void vpx_highbd_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint1 void vpx_highbd_v_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, 
int bd); #define vpx_highbd_v_predictor_8x8 vpx_highbd_v_predictor_8x8_sse2 -void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_10_add vpx_idct16x16_10_add_sse2 -void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_1_add vpx_idct16x16_1_add_sse2 -void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_256_add vpx_idct16x16_256_add_sse2 -void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1024_add_ssse3(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1024_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct32x32_135_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_135_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_1_add vpx_idct32x32_1_add_sse2 -void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_34_add_ssse3(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct32x32_34_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t 
*dest, int stride); +void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_34_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_34_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct4x4_16_add vpx_idct4x4_16_add_sse2 -void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct4x4_1_add vpx_idct4x4_1_add_sse2 -void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct8x8_1_add vpx_idct8x8_1_add_sse2 -void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_64_add_ssse3(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_64_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride); int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); int16_t vpx_int_pro_col_sse2(const uint8_t *ref, const int width); @@ -1491,11 +1491,11 @@ void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride, void vpx_int_pro_row_sse2(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height); #define vpx_int_pro_row vpx_int_pro_row_sse2 -void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_iwht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_iwht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); 
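The bulk of the RTCD header churn in this commit is a mechanical rename of the stride argument from dest_stride to stride; the dispatch pattern itself does not change. Configurations that know the best variant at build time resolve each symbol with a #define (vpx_idct4x4_16_add becomes vpx_idct4x4_16_add_sse2 directly), while configurations that must probe the CPU at run time declare an RTCD_EXTERN function pointer that setup_rtcd_internal() rebinds from flags such as HAS_SSE2 or HAS_SSSE3. Below is a minimal, self-contained sketch of that pattern, with illustrative names only (HAS_SSE2_SKETCH, detect_cpu_flags and my_idct_add are not the real libvpx symbols):

#include <stdint.h>
#include <stdio.h>

#define HAS_SSE2_SKETCH 0x01

typedef void (*idct_add_fn)(const int16_t *input, uint8_t *dest, int stride);

static void my_idct_add_c(const int16_t *input, uint8_t *dest, int stride) {
  (void)input; (void)dest; (void)stride;
  puts("generic C path");
}

static void my_idct_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  (void)input; (void)dest; (void)stride;
  puts("SSE2 path");
}

/* Bound once at startup, like the RTCD_EXTERN pointers in these headers. */
static idct_add_fn my_idct_add = my_idct_add_c;

static int detect_cpu_flags(void) {
  return HAS_SSE2_SKETCH; /* stand-in for a real CPUID-based query */
}

static void setup_dispatch(void) {
  const int flags = detect_cpu_flags();
  my_idct_add = my_idct_add_c;
  if (flags & HAS_SSE2_SKETCH) my_idct_add = my_idct_add_sse2;
}

int main(void) {
  int16_t input[4] = { 0 };
  uint8_t dest[4] = { 0 };
  setup_dispatch();
  my_idct_add(input, dest, 4); /* takes the SSE2 branch in this sketch */
  return 0;
}

On x64 configurations several of these symbols collapse back to plain #defines in this commit (vp8_refining_search_sad, for instance), presumably because SSE2 can be assumed as a baseline there; the matching rtcd_defs.pl change further down moves the specialization from sse3 to sse2.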
#define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_sse2 -void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); @@ -1549,12 +1549,12 @@ void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, #define vpx_lpf_vertical_8_dual vpx_lpf_vertical_8_dual_sse2 void vpx_mbpost_proc_across_ip_c(unsigned char *dst, int pitch, int rows, int cols,int flimit); -void vpx_mbpost_proc_across_ip_xmm(unsigned char *dst, int pitch, int rows, int cols,int flimit); -#define vpx_mbpost_proc_across_ip vpx_mbpost_proc_across_ip_xmm +void vpx_mbpost_proc_across_ip_sse2(unsigned char *dst, int pitch, int rows, int cols,int flimit); +#define vpx_mbpost_proc_across_ip vpx_mbpost_proc_across_ip_sse2 void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,int flimit); -void vpx_mbpost_proc_down_xmm(unsigned char *dst, int pitch, int rows, int cols,int flimit); -#define vpx_mbpost_proc_down vpx_mbpost_proc_down_xmm +void vpx_mbpost_proc_down_sse2(unsigned char *dst, int pitch, int rows, int cols,int flimit); +#define vpx_mbpost_proc_down vpx_mbpost_proc_down_sse2 void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); diff --git a/chromium/third_party/libvpx/source/libvpx/build/make/Android.mk b/chromium/third_party/libvpx/source/libvpx/build/make/Android.mk index 09bdc5d2f70..a88f90056e4 100644 --- a/chromium/third_party/libvpx/source/libvpx/build/make/Android.mk +++ b/chromium/third_party/libvpx/source/libvpx/build/make/Android.mk @@ -64,6 +64,9 @@ CONFIG_DIR := $(LOCAL_PATH)/ LIBVPX_PATH := $(LOCAL_PATH)/libvpx ASM_CNV_PATH_LOCAL := $(TARGET_ARCH_ABI)/ads2gas ASM_CNV_PATH := $(LOCAL_PATH)/$(ASM_CNV_PATH_LOCAL) +ifneq ($(V),1) + qexec := @ +endif # Use the makefiles generated by upstream configure to determine which files to # build. Also set any architecture-specific flags. @@ -103,8 +106,8 @@ LOCAL_ASMFLAGS := -I$(LIBVPX_PATH) .PRECIOUS: %.asm.S $(ASM_CNV_PATH)/libvpx/%.asm.S: $(LIBVPX_PATH)/%.asm - @mkdir -p $(dir $@) - @$(CONFIG_DIR)$(ASM_CONVERSION) <$< > $@ + $(qexec)mkdir -p $(dir $@) + $(qexec)$(CONFIG_DIR)$(ASM_CONVERSION) <$< > $@ # For building *_rtcd.h, which have rules in libs.mk TGT_ISA:=$(word 1, $(subst -, ,$(TOOLCHAIN))) @@ -150,15 +153,27 @@ CODEC_SRCS_ASM_ADS2GAS = $(patsubst %.S, \ LOCAL_SRC_FILES += $(CODEC_SRCS_ASM_ADS2GAS) ifeq ($(TARGET_ARCH_ABI),armeabi-v7a) + ASM_INCLUDES := vpx_dsp/arm/idct_neon.asm.S CODEC_SRCS_ASM_NEON = $(foreach v, \ $(CODEC_SRCS_ASM_ARM_ALL),\ $(if $(findstring neon,$(v)),$(v),)) + CODEC_SRCS_ASM_NEON := $(filter-out $(addprefix %, $(ASM_INCLUDES)), \ + $(CODEC_SRCS_ASM_NEON)) CODEC_SRCS_ASM_NEON_ADS2GAS = $(patsubst %.S, \ $(ASM_CNV_PATH_LOCAL)/libvpx/%.S, \ $(CODEC_SRCS_ASM_NEON)) LOCAL_SRC_FILES += $(patsubst %.S, \ %.S.neon, \ $(CODEC_SRCS_ASM_NEON_ADS2GAS)) + + NEON_ASM_TARGETS = $(patsubst %.S, \ + $(ASM_CNV_PATH)/libvpx/%.S, \ + $(CODEC_SRCS_ASM_NEON)) +# add a dependency to the full path to the ads2gas output to ensure the +# includes are converted first. 
+ifneq ($(strip $(NEON_ASM_TARGETS)),)
+$(NEON_ASM_TARGETS): $(addprefix $(ASM_CNV_PATH)/libvpx/, $(ASM_INCLUDES))
+endif
 endif
 
 LOCAL_CFLAGS += \
@@ -187,7 +202,7 @@ $$(rtcd_dep_template_SRCS): vpx_scale_rtcd.h
 $$(rtcd_dep_template_SRCS): vpx_dsp_rtcd.h
 
 rtcd_dep_template_CONFIG_ASM_ABIS := x86 x86_64 armeabi-v7a
-ifneq ($(findstring $(TARGET_ARCH_ABI),$(rtcd_dep_template_CONFIG_ASM_ABIS)),)
+ifneq ($$(findstring $(TARGET_ARCH_ABI),$$(rtcd_dep_template_CONFIG_ASM_ABIS)),)
 $$(rtcd_dep_template_SRCS): vpx_config.asm
 endif
 endef
@@ -197,16 +212,17 @@ $(eval $(call rtcd_dep_template))
 .PHONY: clean
 clean:
 	@echo "Clean: ads2gas files [$(TARGET_ARCH_ABI)]"
-	@$(RM) $(CODEC_SRCS_ASM_ADS2GAS) $(CODEC_SRCS_ASM_NEON_ADS2GAS)
-	@$(RM) -r $(ASM_CNV_PATH)
-	@$(RM) $(CLEAN-OBJS)
+	$(qexec)$(RM) $(CODEC_SRCS_ASM_ADS2GAS) $(CODEC_SRCS_ASM_NEON_ADS2GAS)
+	$(qexec)$(RM) -r $(ASM_CNV_PATH)
+	$(qexec)$(RM) $(CLEAN-OBJS)
 
 ifeq ($(ENABLE_SHARED),1)
+  LOCAL_CFLAGS += -fPIC
   include $(BUILD_SHARED_LIBRARY)
 else
   include $(BUILD_STATIC_LIBRARY)
 endif
 
 ifeq ($(CONFIG_RUNTIME_CPU_DETECT),yes)
-$(call import-module,cpufeatures)
+$(call import-module,android/cpufeatures)
 endif
diff --git a/chromium/third_party/libvpx/source/libvpx/examples/vp8_multi_resolution_encoder.c b/chromium/third_party/libvpx/source/libvpx/examples/vp8_multi_resolution_encoder.c
index 65308a0bd0b..0b9663c777b 100644
--- a/chromium/third_party/libvpx/source/libvpx/examples/vp8_multi_resolution_encoder.c
+++ b/chromium/third_party/libvpx/source/libvpx/examples/vp8_multi_resolution_encoder.c
@@ -240,9 +240,9 @@ static void set_temporal_layer_pattern(int num_temporal_layers,
       cfg->ts_layer_id[1] = 2;
       cfg->ts_layer_id[2] = 1;
       cfg->ts_layer_id[3] = 2;
-      // Use 40/20/40 bit allocation as example.
-      cfg->ts_target_bitrate[0] = 0.4f * bitrate;
-      cfg->ts_target_bitrate[1] = 0.6f * bitrate;
+      // Use 45/20/35 bit allocation as example.
+      cfg->ts_target_bitrate[0] = 0.45f * bitrate;
+      cfg->ts_target_bitrate[1] = 0.65f * bitrate;
       cfg->ts_target_bitrate[2] = bitrate;
 
       /* 0=L, 1=GF, 2=ARF */
@@ -460,7 +460,7 @@ int main(int argc, char **argv) {
 
   // Set the number of threads per encode/spatial layer.
   // (1, 1, 1) means no encoder threading.
-  cfg[0].g_threads = 2;
+  cfg[0].g_threads = 1;
   cfg[1].g_threads = 1;
   cfg[2].g_threads = 1;
 
@@ -507,9 +507,11 @@ int main(int argc, char **argv) {
 
   /* Set NOISE_SENSITIVITY to do TEMPORAL_DENOISING */
   /* Enable denoising for the highest-resolution encoder. */
-  if (vpx_codec_control(&codec[0], VP8E_SET_NOISE_SENSITIVITY, 4))
+  if (vpx_codec_control(&codec[0], VP8E_SET_NOISE_SENSITIVITY, 1))
     die_codec(&codec[0], "Failed to set noise_sensitivity");
-  for (i = 1; i < NUM_ENCODERS; i++) {
+  if (vpx_codec_control(&codec[1], VP8E_SET_NOISE_SENSITIVITY, 1))
+    die_codec(&codec[1], "Failed to set noise_sensitivity");
+  for (i = 2; i < NUM_ENCODERS; i++) {
     if (vpx_codec_control(&codec[i], VP8E_SET_NOISE_SENSITIVITY, 0))
       die_codec(&codec[i], "Failed to set noise_sensitivity");
   }
diff --git a/chromium/third_party/libvpx/source/libvpx/examples/vp9_spatial_svc_encoder.c b/chromium/third_party/libvpx/source/libvpx/examples/vp9_spatial_svc_encoder.c
index fa2df7271b2..0e409387b3e 100644
--- a/chromium/third_party/libvpx/source/libvpx/examples/vp9_spatial_svc_encoder.c
+++ b/chromium/third_party/libvpx/source/libvpx/examples/vp9_spatial_svc_encoder.c
@@ -679,7 +679,7 @@ int main(int argc, const char **argv) {
     }
 #if OUTPUT_RC_STATS
     // For now, just write temporal layer streams.
-    // TODO(wonkap): do spatial by re-writing superframe.
+ // TODO(marpan): do spatial by re-writing superframe. if (svc_ctx.output_rc_stat) { for (tl = 0; tl < enc_cfg.ts_number_layers; ++tl) { char file_name[PATH_MAX]; @@ -770,7 +770,7 @@ int main(int argc, const char **argv) { cx_pkt->data.frame.sz, cx_pkt->data.frame.pts); #if OUTPUT_RC_STATS - // TODO(marpan/wonkap): Put this (to line728) in separate function. + // TODO(marpan): Put this (to line728) in separate function. if (svc_ctx.output_rc_stat) { vpx_codec_control(&codec, VP9E_GET_SVC_LAYER_ID, &layer_id); parse_superframe_index(cx_pkt->data.frame.buf, diff --git a/chromium/third_party/libvpx/source/libvpx/examples/vpx_temporal_svc_encoder.c b/chromium/third_party/libvpx/source/libvpx/examples/vpx_temporal_svc_encoder.c index 752c1baead1..b9069808350 100644 --- a/chromium/third_party/libvpx/source/libvpx/examples/vpx_temporal_svc_encoder.c +++ b/chromium/third_party/libvpx/source/libvpx/examples/vpx_temporal_svc_encoder.c @@ -702,11 +702,14 @@ int main(int argc, char **argv) { vpx_codec_control(&codec, VP8E_SET_CPUUSED, -speed); vpx_codec_control(&codec, VP8E_SET_NOISE_SENSITIVITY, kDenoiserOff); vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1); + vpx_codec_control(&codec, VP8E_SET_GF_CBR_BOOST_PCT, 0); } else if (strncmp(encoder->name, "vp9", 3) == 0) { vpx_svc_extra_cfg_t svc_params; memset(&svc_params, 0, sizeof(svc_params)); vpx_codec_control(&codec, VP8E_SET_CPUUSED, speed); vpx_codec_control(&codec, VP9E_SET_AQ_MODE, 3); + vpx_codec_control(&codec, VP9E_SET_GF_CBR_BOOST_PCT, 0); + vpx_codec_control(&codec, VP9E_SET_FRAME_PARALLEL_DECODING, 0); vpx_codec_control(&codec, VP9E_SET_FRAME_PERIODIC_BOOST, 0); vpx_codec_control(&codec, VP9E_SET_NOISE_SENSITIVITY, kDenoiserOff); vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1); diff --git a/chromium/third_party/libvpx/source/libvpx/libs.mk b/chromium/third_party/libvpx/source/libvpx/libs.mk index f4f48cc1621..e0a2cc097de 100644 --- a/chromium/third_party/libvpx/source/libvpx/libs.mk +++ b/chromium/third_party/libvpx/source/libvpx/libs.mk @@ -391,7 +391,7 @@ LIBVPX_TEST_SRCS=$(addprefix test/,$(call enabled,LIBVPX_TEST_SRCS)) LIBVPX_TEST_BIN=./test_libvpx$(EXE_SFX) LIBVPX_TEST_DATA=$(addprefix $(LIBVPX_TEST_DATA_PATH)/,\ $(call enabled,LIBVPX_TEST_DATA)) -libvpx_test_data_url=http://downloads.webmproject.org/test_data/libvpx/$(1) +libvpx_test_data_url=https://storage.googleapis.com/downloads.webmproject.org/test_data/libvpx/$(1) TEST_INTRA_PRED_SPEED_BIN=./test_intra_pred_speed$(EXE_SFX) TEST_INTRA_PRED_SPEED_SRCS=$(addprefix test/,$(call enabled,TEST_INTRA_PRED_SPEED_SRCS)) @@ -405,7 +405,7 @@ CLEAN-OBJS += libvpx_test_srcs.txt $(LIBVPX_TEST_DATA): $(SRC_PATH_BARE)/test/test-data.sha1 @echo " [DOWNLOAD] $@" $(qexec)trap 'rm -f $@' INT TERM &&\ - curl -L -o $@ $(call libvpx_test_data_url,$(@F)) + curl --retry 1 -L -o $@ $(call libvpx_test_data_url,$(@F)) testdata:: $(LIBVPX_TEST_DATA) $(qexec)[ -x "$$(which sha1sum)" ] && sha1sum=sha1sum;\ diff --git a/chromium/third_party/libvpx/source/libvpx/tools_common.h b/chromium/third_party/libvpx/source/libvpx/tools_common.h index 73ba1bc03ba..c4a48b24de0 100644 --- a/chromium/third_party/libvpx/source/libvpx/tools_common.h +++ b/chromium/third_party/libvpx/source/libvpx/tools_common.h @@ -26,11 +26,21 @@ /* MSVS uses _f{seek,tell}i64. */ #define fseeko _fseeki64 #define ftello _ftelli64 +typedef int64_t FileOffset; #elif defined(_WIN32) /* MinGW uses f{seek,tell}o64 for large files. 
*/ #define fseeko fseeko64 #define ftello ftello64 -#endif /* _WIN32 */ +typedef off64_t FileOffset; +#elif CONFIG_OS_SUPPORT +typedef off_t FileOffset; +/* Use 32-bit file operations in WebM file format when building ARM + * executables (.axf) with RVCT. */ +#else +#define fseeko fseek +#define ftello ftell +typedef long FileOffset /* NOLINT */ +#endif /* CONFIG_OS_SUPPORT */ #if CONFIG_OS_SUPPORT #if defined(_MSC_VER) @@ -42,13 +52,6 @@ #endif /* _MSC_VER */ #endif /* CONFIG_OS_SUPPORT */ -/* Use 32-bit file operations in WebM file format when building ARM - * executables (.axf) with RVCT. */ -#if !CONFIG_OS_SUPPORT -#define fseeko fseek -#define ftello ftell -#endif /* CONFIG_OS_SUPPORT */ - #define LITERALU64(hi, lo) ((((uint64_t)hi) << 32) | lo) #ifndef PATH_MAX diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/mips/dspr2/filter_dspr2.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/mips/dspr2/filter_dspr2.c index 7612024b7d0..2de343419ac 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/common/mips/dspr2/filter_dspr2.c +++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/mips/dspr2/filter_dspr2.c @@ -1469,6 +1469,7 @@ void vp8_filter_block2d_second_pass_8(unsigned char *RESTRICT src_ptr, unsigned char src_ptr_r2; unsigned char src_ptr_r3; unsigned char *cm = ff_cropTbl + CROP_WIDTH; + (void)output_width; vector4a = 64; diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/mips/dspr2/vp8_loopfilter_filters_dspr2.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/mips/dspr2/vp8_loopfilter_filters_dspr2.c index b79af1cc88f..d2c34425156 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/common/mips/dspr2/vp8_loopfilter_filters_dspr2.c +++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/mips/dspr2/vp8_loopfilter_filters_dspr2.c @@ -306,6 +306,7 @@ void vp8_loop_filter_horizontal_edge_mips(unsigned char *s, int p, uint32_t hev; uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6; + (void)count; mask = 0; hev = 0; @@ -498,6 +499,7 @@ void vp8_loop_filter_uvhorizontal_edge_mips(unsigned char *s, int p, uint32_t hev; uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6; + (void)count; mask = 0; hev = 0; @@ -918,6 +920,7 @@ void vp8_loop_filter_uvvertical_edge_mips(unsigned char *s, int p, uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; unsigned char *s1, *s2, *s3, *s4; uint32_t prim1, prim2, sec3, sec4, prim3, prim4; + (void)count; /* loop filter designed to work using chars so that we can make maximum use * of 8 bit simd instructions. 
@@ -1612,6 +1615,7 @@ void vp8_mbloop_filter_uvhorizontal_edge_mips(unsigned char *s, int p, uint32_t mask, hev; uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6; + (void)count; mask = 0; hev = 0; @@ -1915,6 +1919,7 @@ void vp8_mbloop_filter_uvvertical_edge_mips(unsigned char *s, int p, uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; unsigned char *s1, *s2, *s3, *s4; uint32_t prim1, prim2, sec3, sec4, prim3, prim4; + (void)count; mask = 0; hev = 0; diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/ppflags.h b/chromium/third_party/libvpx/source/libvpx/vp8/common/ppflags.h index 713f5dffe09..96e3af6c9c1 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/common/ppflags.h +++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/ppflags.h @@ -19,14 +19,7 @@ enum { VP8D_DEBLOCK = 1 << 0, VP8D_DEMACROBLOCK = 1 << 1, VP8D_ADDNOISE = 1 << 2, - VP8D_DEBUG_TXT_FRAME_INFO = 1 << 3, - VP8D_DEBUG_TXT_MBLK_MODES = 1 << 4, - VP8D_DEBUG_TXT_DC_DIFF = 1 << 5, - VP8D_DEBUG_TXT_RATE_INFO = 1 << 6, - VP8D_DEBUG_DRAW_MV = 1 << 7, - VP8D_DEBUG_CLR_BLK_MODES = 1 << 8, - VP8D_DEBUG_CLR_FRM_REF_BLKS = 1 << 9, - VP8D_MFQE = 1 << 10 + VP8D_MFQE = 1 << 3 }; typedef struct { diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/rtcd_defs.pl b/chromium/third_party/libvpx/source/libvpx/vp8/common/rtcd_defs.pl index c0e95b15a0f..bc5e0579999 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/common/rtcd_defs.pl +++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/rtcd_defs.pl @@ -210,8 +210,9 @@ $vp8_full_search_sad_sse3=vp8_full_search_sadx3; $vp8_full_search_sad_sse4_1=vp8_full_search_sadx8; add_proto qw/int vp8_refining_search_sad/, "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv"; -specialize qw/vp8_refining_search_sad sse3/; -$vp8_refining_search_sad_sse3=vp8_refining_search_sadx4; +specialize qw/vp8_refining_search_sad sse2 msa/; +$vp8_refining_search_sad_sse2=vp8_refining_search_sadx4; +$vp8_refining_search_sad_msa=vp8_refining_search_sadx4; add_proto qw/int vp8_diamond_search_sad/, "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv"; specialize qw/vp8_diamond_search_sad sse2 msa/; diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/threading.h b/chromium/third_party/libvpx/source/libvpx/vp8/common/threading.h index f27b209c40e..ece64f3fb43 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/common/threading.h +++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/threading.h @@ -191,8 +191,47 @@ static inline int sem_destroy(sem_t *sem) { #define x86_pause_hint() #endif +#if defined(__has_feature) +#if __has_feature(thread_sanitizer) +#define USE_MUTEX_LOCK 1 +#endif +#endif + #include "vpx_util/vpx_thread.h" +static INLINE int protected_read(pthread_mutex_t *const mutex, const int *p) { + (void)mutex; +#if defined(USE_MUTEX_LOCK) + int ret; + pthread_mutex_lock(mutex); + ret = *p; + pthread_mutex_unlock(mutex); + return ret; +#endif + return *p; +} + +static INLINE void sync_read(pthread_mutex_t *const mutex, int mb_col, + const int *last_row_current_mb_col, + const int nsync) { + while (mb_col > (protected_read(mutex, last_row_current_mb_col) - nsync)) { + x86_pause_hint(); + thread_sleep(0); + } 
+} + +static INLINE void protected_write(pthread_mutex_t *mutex, int *p, int v) { + (void)mutex; +#if defined(USE_MUTEX_LOCK) + pthread_mutex_lock(mutex); + *p = v; + pthread_mutex_unlock(mutex); + return; +#endif + *p = v; +} + +#undef USE_MUTEX_LOCK #endif /* CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD */ #ifdef __cplusplus diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/decoder/onyxd_int.h b/chromium/third_party/libvpx/source/libvpx/vp8/decoder/onyxd_int.h index e50fafd4f94..88b1ff16bca 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/decoder/onyxd_int.h +++ b/chromium/third_party/libvpx/source/libvpx/vp8/decoder/onyxd_int.h @@ -67,7 +67,8 @@ typedef struct VP8D_COMP { #if CONFIG_MULTITHREAD /* variable for threading */ - volatile int b_multithreaded_rd; + + int b_multithreaded_rd; int max_threads; int current_mb_col_main; unsigned int decoding_thread_count; @@ -76,6 +77,8 @@ typedef struct VP8D_COMP { int mt_baseline_filter_level[MAX_MB_SEGMENTS]; int sync_range; int *mt_current_mb_col; /* Each row remembers its already decoded column. */ + pthread_mutex_t *pmutex; + pthread_mutex_t mt_mutex; /* mutex for b_multithreaded_rd */ unsigned char **mt_yabove_row; /* mb_rows x width */ unsigned char **mt_uabove_row; diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/decoder/threading.c b/chromium/third_party/libvpx/source/libvpx/vp8/decoder/threading.c index 44ca16bfdd4..9f77519882c 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/decoder/threading.c +++ b/chromium/third_party/libvpx/source/libvpx/vp8/decoder/threading.c @@ -50,9 +50,6 @@ static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, mbd->subpixel_predict8x8 = xd->subpixel_predict8x8; mbd->subpixel_predict16x16 = xd->subpixel_predict16x16; - mbd->mode_info_context = pc->mi + pc->mode_info_stride * (i + 1); - mbd->mode_info_stride = pc->mode_info_stride; - mbd->frame_type = pc->frame_type; mbd->pre = xd->pre; mbd->dst = xd->dst; @@ -251,8 +248,8 @@ static void mt_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd, int start_mb_row) { - volatile const int *last_row_current_mb_col; - volatile int *current_mb_col; + const int *last_row_current_mb_col; + int *current_mb_col; int mb_row; VP8_COMMON *pc = &pbi->common; const int nsync = pbi->sync_range; @@ -289,6 +286,9 @@ static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd, xd->up_available = (start_mb_row != 0); + xd->mode_info_context = pc->mi + pc->mode_info_stride * start_mb_row; + xd->mode_info_stride = pc->mode_info_stride; + for (mb_row = start_mb_row; mb_row < pc->mb_rows; mb_row += (pbi->decoding_thread_count + 1)) { int recon_yoffset, recon_uvoffset; @@ -318,7 +318,7 @@ static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd, xd->left_available = 0; - xd->mb_to_top_edge = -((mb_row * 16)) << 3; + xd->mb_to_top_edge = -((mb_row * 16) << 3); xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3; if (pbi->common.filter_level) { @@ -355,14 +355,15 @@ static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd, xd->dst.uv_stride); } - for (mb_col = 0; mb_col < pc->mb_cols; mb_col++) { - *current_mb_col = mb_col - 1; + for (mb_col = 0; mb_col < pc->mb_cols; ++mb_col) { + if (((mb_col - 1) % nsync) == 0) { + pthread_mutex_t *mutex = &pbi->pmutex[mb_row]; + protected_write(mutex, current_mb_col, mb_col - 1); + } - if ((mb_col & (nsync - 1)) == 0) { - while (mb_col > (*last_row_current_mb_col - nsync)) { - x86_pause_hint(); - thread_sleep(0); - } + 
if (mb_row && !(mb_col & (nsync - 1))) { + pthread_mutex_t *mutex = &pbi->pmutex[mb_row - 1]; + sync_read(mutex, mb_col, last_row_current_mb_col, nsync); } /* Distance of MB to the various image edges. @@ -548,7 +549,7 @@ static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd, } /* last MB of row is ready just after extension is done */ - *current_mb_col = mb_col + nsync; + protected_write(&pbi->pmutex[mb_row], current_mb_col, mb_col + nsync); ++xd->mode_info_context; /* skip prediction column */ xd->up_available = 1; @@ -568,10 +569,10 @@ static THREAD_FUNCTION thread_decoding_proc(void *p_data) { ENTROPY_CONTEXT_PLANES mb_row_left_context; while (1) { - if (pbi->b_multithreaded_rd == 0) break; + if (protected_read(&pbi->mt_mutex, &pbi->b_multithreaded_rd) == 0) break; if (sem_wait(&pbi->h_event_start_decoding[ithread]) == 0) { - if (pbi->b_multithreaded_rd == 0) { + if (protected_read(&pbi->mt_mutex, &pbi->b_multithreaded_rd) == 0) { break; } else { MACROBLOCKD *xd = &mbrd->mbd; @@ -591,6 +592,7 @@ void vp8_decoder_create_threads(VP8D_COMP *pbi) { pbi->b_multithreaded_rd = 0; pbi->allocated_decoding_thread_count = 0; + pthread_mutex_init(&pbi->mt_mutex, NULL); /* limit decoding threads to the max number of token partitions */ core_count = (pbi->max_threads > 8) ? 8 : pbi->max_threads; @@ -647,6 +649,16 @@ void vp8_decoder_create_threads(VP8D_COMP *pbi) { void vp8mt_de_alloc_temp_buffers(VP8D_COMP *pbi, int mb_rows) { int i; + /* De-allocate mutex */ + if (pbi->pmutex != NULL) { + for (i = 0; i < mb_rows; ++i) { + pthread_mutex_destroy(&pbi->pmutex[i]); + } + + vpx_free(pbi->pmutex); + pbi->pmutex = NULL; + } + vpx_free(pbi->mt_current_mb_col); pbi->mt_current_mb_col = NULL; @@ -712,7 +724,7 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows) { int i; int uv_width; - if (pbi->b_multithreaded_rd) { + if (protected_read(&pbi->mt_mutex, &pbi->b_multithreaded_rd)) { vp8mt_de_alloc_temp_buffers(pbi, prev_mb_rows); /* our internal buffers are always multiples of 16 */ @@ -730,6 +742,15 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows) { uv_width = width >> 1; + /* Allocate mutex */ + CHECK_MEM_ERROR(pbi->pmutex, + vpx_malloc(sizeof(*pbi->pmutex) * pc->mb_rows)); + if (pbi->pmutex) { + for (i = 0; i < pc->mb_rows; ++i) { + pthread_mutex_init(&pbi->pmutex[i], NULL); + } + } + /* Allocate an int for each mb row. 
*/ CALLOC_ARRAY(pbi->mt_current_mb_col, pc->mb_rows); @@ -772,9 +793,9 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows) { void vp8_decoder_remove_threads(VP8D_COMP *pbi) { /* shutdown MB Decoding thread; */ - if (pbi->b_multithreaded_rd) { + if (protected_read(&pbi->mt_mutex, &pbi->b_multithreaded_rd)) { int i; - pbi->b_multithreaded_rd = 0; + protected_write(&pbi->mt_mutex, &pbi->b_multithreaded_rd, 0); /* allow all threads to exit */ for (i = 0; i < pbi->allocated_decoding_thread_count; ++i) { @@ -804,6 +825,7 @@ void vp8_decoder_remove_threads(VP8D_COMP *pbi) { vp8mt_de_alloc_temp_buffers(pbi, pbi->common.mb_rows); } + pthread_mutex_destroy(&pbi->mt_mutex); } void vp8mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd) { diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/encodeframe.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/encodeframe.c index e41d513c1b7..c7ad3bfe2c9 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/encodeframe.c +++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/encodeframe.c @@ -345,8 +345,8 @@ static void encode_mb_row(VP8_COMP *cpi, VP8_COMMON *cm, int mb_row, #if CONFIG_MULTITHREAD const int nsync = cpi->mt_sync_range; const int rightmost_col = cm->mb_cols + nsync; - volatile const int *last_row_current_mb_col; - volatile int *current_mb_col = &cpi->mt_current_mb_col[mb_row]; + const int *last_row_current_mb_col; + int *current_mb_col = &cpi->mt_current_mb_col[mb_row]; if ((cpi->b_multi_threaded != 0) && (mb_row != 0)) { last_row_current_mb_col = &cpi->mt_current_mb_col[mb_row - 1]; @@ -419,13 +419,14 @@ static void encode_mb_row(VP8_COMP *cpi, VP8_COMMON *cm, int mb_row, #if CONFIG_MULTITHREAD if (cpi->b_multi_threaded != 0) { - *current_mb_col = mb_col - 1; /* set previous MB done */ + if (((mb_col - 1) % nsync) == 0) { + pthread_mutex_t *mutex = &cpi->pmutex[mb_row]; + protected_write(mutex, current_mb_col, mb_col - 1); + } - if ((mb_col & (nsync - 1)) == 0) { - while (mb_col > (*last_row_current_mb_col - nsync)) { - x86_pause_hint(); - thread_sleep(0); - } + if (mb_row && !(mb_col & (nsync - 1))) { + pthread_mutex_t *mutex = &cpi->pmutex[mb_row - 1]; + sync_read(mutex, mb_col, last_row_current_mb_col, nsync); } } #endif @@ -565,7 +566,9 @@ static void encode_mb_row(VP8_COMP *cpi, VP8_COMMON *cm, int mb_row, xd->dst.u_buffer + 8, xd->dst.v_buffer + 8); #if CONFIG_MULTITHREAD - if (cpi->b_multi_threaded != 0) *current_mb_col = rightmost_col; + if (cpi->b_multi_threaded != 0) { + protected_write(&cpi->pmutex[mb_row], current_mb_col, rightmost_col); + } #endif /* this is to account for the border */ diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ethreading.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ethreading.c index 708002b1e67..df34997accd 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ethreading.c +++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ethreading.c @@ -25,11 +25,11 @@ static THREAD_FUNCTION thread_loopfilter(void *p_data) { VP8_COMMON *cm = &cpi->common; while (1) { - if (cpi->b_multi_threaded == 0) break; + if (protected_read(&cpi->mt_mutex, &cpi->b_multi_threaded) == 0) break; if (sem_wait(&cpi->h_event_start_lpf) == 0) { /* we're shutting down */ - if (cpi->b_multi_threaded == 0) break; + if (protected_read(&cpi->mt_mutex, &cpi->b_multi_threaded) == 0) break; vp8_loopfilter_frame(cpi, cm); @@ -47,7 +47,7 @@ static THREAD_FUNCTION thread_encoding_proc(void *p_data) { ENTROPY_CONTEXT_PLANES 
mb_row_left_context; while (1) { - if (cpi->b_multi_threaded == 0) break; + if (protected_read(&cpi->mt_mutex, &cpi->b_multi_threaded) == 0) break; if (sem_wait(&cpi->h_event_start_encoding[ithread]) == 0) { const int nsync = cpi->mt_sync_range; @@ -65,7 +65,10 @@ static THREAD_FUNCTION thread_encoding_proc(void *p_data) { int *totalrate = &mbri->totalrate; /* we're shutting down */ - if (cpi->b_multi_threaded == 0) break; + if (protected_read(&cpi->mt_mutex, &cpi->b_multi_threaded) == 0) break; + + xd->mode_info_context = cm->mi + cm->mode_info_stride * (ithread + 1); + xd->mode_info_stride = cm->mode_info_stride; for (mb_row = ithread + 1; mb_row < cm->mb_rows; mb_row += (cpi->encoding_thread_count + 1)) { @@ -76,8 +79,8 @@ static THREAD_FUNCTION thread_encoding_proc(void *p_data) { int recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride; int recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride; int map_index = (mb_row * cm->mb_cols); - volatile const int *last_row_current_mb_col; - volatile int *current_mb_col = &cpi->mt_current_mb_col[mb_row]; + const int *last_row_current_mb_col; + int *current_mb_col = &cpi->mt_current_mb_col[mb_row]; #if (CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING) vp8_writer *w = &cpi->bc[1 + (mb_row % num_part)]; @@ -103,13 +106,14 @@ static THREAD_FUNCTION thread_encoding_proc(void *p_data) { /* for each macroblock col in image */ for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) { - *current_mb_col = mb_col - 1; + if (((mb_col - 1) % nsync) == 0) { + pthread_mutex_t *mutex = &cpi->pmutex[mb_row]; + protected_write(mutex, current_mb_col, mb_col - 1); + } - if ((mb_col & (nsync - 1)) == 0) { - while (mb_col > (*last_row_current_mb_col - nsync)) { - x86_pause_hint(); - thread_sleep(0); - } + if (mb_row && !(mb_col & (nsync - 1))) { + pthread_mutex_t *mutex = &cpi->pmutex[mb_row - 1]; + sync_read(mutex, mb_col, last_row_current_mb_col, nsync); } #if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING @@ -281,7 +285,7 @@ static THREAD_FUNCTION thread_encoding_proc(void *p_data) { vp8_extend_mb_row(&cm->yv12_fb[dst_fb_idx], xd->dst.y_buffer + 16, xd->dst.u_buffer + 8, xd->dst.v_buffer + 8); - *current_mb_col = mb_col + nsync; + protected_write(&cpi->pmutex[mb_row], current_mb_col, mb_col + nsync); /* this is to account for the border */ xd->mode_info_context++; @@ -450,9 +454,6 @@ void vp8cx_init_mbrthread_data(VP8_COMP *cpi, MACROBLOCK *x, mb->partition_info = x->pi + x->e_mbd.mode_info_stride * (i + 1); - mbd->mode_info_context = cm->mi + x->e_mbd.mode_info_stride * (i + 1); - mbd->mode_info_stride = cm->mode_info_stride; - mbd->frame_type = cm->frame_type; mb->src = *cpi->Source; @@ -492,6 +493,8 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) { cpi->encoding_thread_count = 0; cpi->b_lpf_running = 0; + pthread_mutex_init(&cpi->mt_mutex, NULL); + if (cm->processor_core_count > 1 && cpi->oxcf.multi_threaded > 1) { int ithread; int th_count = cpi->oxcf.multi_threaded - 1; @@ -551,7 +554,7 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) { if (rc) { /* shutdown other threads */ - cpi->b_multi_threaded = 0; + protected_write(&cpi->mt_mutex, &cpi->b_multi_threaded, 0); for (--ithread; ithread >= 0; ithread--) { pthread_join(cpi->h_encoding_thread[ithread], 0); sem_destroy(&cpi->h_event_start_encoding[ithread]); @@ -565,6 +568,8 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) { vpx_free(cpi->mb_row_ei); vpx_free(cpi->en_thread_data); + pthread_mutex_destroy(&cpi->mt_mutex); + return -1; } @@ -579,7 +584,7 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) { if 
(rc) { /* shutdown other threads */ - cpi->b_multi_threaded = 0; + protected_write(&cpi->mt_mutex, &cpi->b_multi_threaded, 0); for (--ithread; ithread >= 0; ithread--) { sem_post(&cpi->h_event_start_encoding[ithread]); sem_post(&cpi->h_event_end_encoding[ithread]); @@ -597,6 +602,8 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) { vpx_free(cpi->mb_row_ei); vpx_free(cpi->en_thread_data); + pthread_mutex_destroy(&cpi->mt_mutex); + return -2; } } @@ -605,9 +612,9 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) { } void vp8cx_remove_encoder_threads(VP8_COMP *cpi) { - if (cpi->b_multi_threaded) { + if (protected_read(&cpi->mt_mutex, &cpi->b_multi_threaded)) { /* shutdown other threads */ - cpi->b_multi_threaded = 0; + protected_write(&cpi->mt_mutex, &cpi->b_multi_threaded, 0); { int i; @@ -635,5 +642,6 @@ void vp8cx_remove_encoder_threads(VP8_COMP *cpi) { vpx_free(cpi->mb_row_ei); vpx_free(cpi->en_thread_data); } + pthread_mutex_destroy(&cpi->mt_mutex); } #endif diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_if.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_if.c index c5389594553..9717feb136b 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_if.c +++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_if.c @@ -446,6 +446,18 @@ static void dealloc_compressor_data(VP8_COMP *cpi) { cpi->mb.pip = 0; #if CONFIG_MULTITHREAD + /* De-allocate mutex */ + if (cpi->pmutex != NULL) { + VP8_COMMON *const pc = &cpi->common; + int i; + + for (i = 0; i < pc->mb_rows; ++i) { + pthread_mutex_destroy(&cpi->pmutex[i]); + } + vpx_free(cpi->pmutex); + cpi->pmutex = NULL; + } + vpx_free(cpi->mt_current_mb_col); cpi->mt_current_mb_col = NULL; #endif @@ -1075,6 +1087,9 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi) { int width = cm->Width; int height = cm->Height; +#if CONFIG_MULTITHREAD + int prev_mb_rows = cm->mb_rows; +#endif if (vp8_alloc_frame_buffers(cm, width, height)) { vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, @@ -1164,6 +1179,25 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi) { } if (cpi->oxcf.multi_threaded > 1) { + int i; + + /* De-allocate and re-allocate mutex */ + if (cpi->pmutex != NULL) { + for (i = 0; i < prev_mb_rows; ++i) { + pthread_mutex_destroy(&cpi->pmutex[i]); + } + vpx_free(cpi->pmutex); + cpi->pmutex = NULL; + } + + CHECK_MEM_ERROR(cpi->pmutex, + vpx_malloc(sizeof(*cpi->pmutex) * cm->mb_rows)); + if (cpi->pmutex) { + for (i = 0; i < cm->mb_rows; ++i) { + pthread_mutex_init(&cpi->pmutex[i], NULL); + } + } + vpx_free(cpi->mt_current_mb_col); CHECK_MEM_ERROR(cpi->mt_current_mb_col, vpx_malloc(sizeof(*cpi->mt_current_mb_col) * cm->mb_rows)); diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_int.h b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_int.h index bfcc6457c19..fe775064a45 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_int.h +++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_int.h @@ -511,6 +511,8 @@ typedef struct VP8_COMP { #if CONFIG_MULTITHREAD /* multithread data */ + pthread_mutex_t *pmutex; + pthread_mutex_t mt_mutex; /* mutex for b_multi_threaded */ int *mt_current_mb_col; int mt_sync_range; int b_multi_threaded; diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c index cbd61de90ab..dd1ea03b6b9 100644 --- 
a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c @@ -77,10 +77,10 @@ static INLINE void IDCT4x4_1D(int16x4_t *d0s16, int16x4_t *d1s16, q15s32 = vmlsl_s16(q15s32, d19s16, *d0s16); q10s32 = vmlal_s16(q10s32, d19s16, *d2s16); - d26s16 = vqrshrn_n_s32(q13s32, 14); - d27s16 = vqrshrn_n_s32(q14s32, 14); - d29s16 = vqrshrn_n_s32(q15s32, 14); - d28s16 = vqrshrn_n_s32(q10s32, 14); + d26s16 = vrshrn_n_s32(q13s32, 14); + d27s16 = vrshrn_n_s32(q14s32, 14); + d29s16 = vrshrn_n_s32(q15s32, 14); + d28s16 = vrshrn_n_s32(q10s32, 14); q13s16 = vcombine_s16(d26s16, d27s16); q14s16 = vcombine_s16(d28s16, d29s16); @@ -125,17 +125,17 @@ static INLINE void IADST4x4_1D(int16x4_t *d3s16, int16x4_t *d4s16, q14s32 = vaddq_s32(q11s32, q12s32); q10s32 = vsubq_s32(q10s32, q12s32); - d16s16 = vqrshrn_n_s32(q13s32, 14); - d17s16 = vqrshrn_n_s32(q14s32, 14); - d18s16 = vqrshrn_n_s32(q15s32, 14); - d19s16 = vqrshrn_n_s32(q10s32, 14); + d16s16 = vrshrn_n_s32(q13s32, 14); + d17s16 = vrshrn_n_s32(q14s32, 14); + d18s16 = vrshrn_n_s32(q15s32, 14); + d19s16 = vrshrn_n_s32(q10s32, 14); *q8s16 = vcombine_s16(d16s16, d17s16); *q9s16 = vcombine_s16(d18s16, d19s16); } -void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, - int dest_stride, int tx_type) { +void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride, + int tx_type) { uint8x8_t d26u8, d27u8; int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16; uint32x2_t d26u32, d27u32; @@ -151,7 +151,7 @@ void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, switch (tx_type) { case 0: // idct_idct is not supported. Fall back to C - vp9_iht4x4_16_add_c(input, dest, dest_stride, tx_type); + vp9_iht4x4_16_add_c(input, dest, stride, tx_type); return; case 1: // iadst_idct // generate constants @@ -203,11 +203,11 @@ void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, q9s16 = vrshrq_n_s16(q9s16, 4); d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 0); - dest += dest_stride; + dest += stride; d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 1); - dest += dest_stride; + dest += stride; d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 0); - dest += dest_stride; + dest += stride; d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 1); q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u32(d26u32)); @@ -217,10 +217,10 @@ void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 1); - dest -= dest_stride; + dest -= stride; vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 0); - dest -= dest_stride; + dest -= stride; vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 1); - dest -= dest_stride; + dest -= stride; vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 0); } diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c index f7e0a6d9817..1c739861c38 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c @@ -76,10 +76,10 @@ static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16, q5s32 = vmlsl_s16(q5s32, d22s16, d3s16); q6s32 = vmlsl_s16(q6s32, d23s16, d3s16); - d8s16 
= vqrshrn_n_s32(q2s32, 14); - d9s16 = vqrshrn_n_s32(q3s32, 14); - d10s16 = vqrshrn_n_s32(q5s32, 14); - d11s16 = vqrshrn_n_s32(q6s32, 14); + d8s16 = vrshrn_n_s32(q2s32, 14); + d9s16 = vrshrn_n_s32(q3s32, 14); + d10s16 = vrshrn_n_s32(q5s32, 14); + d11s16 = vrshrn_n_s32(q6s32, 14); q4s16 = vcombine_s16(d8s16, d9s16); q5s16 = vcombine_s16(d10s16, d11s16); @@ -93,10 +93,10 @@ static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16, q9s32 = vmlal_s16(q9s32, d22s16, d2s16); q13s32 = vmlal_s16(q13s32, d23s16, d2s16); - d14s16 = vqrshrn_n_s32(q2s32, 14); - d15s16 = vqrshrn_n_s32(q3s32, 14); - d12s16 = vqrshrn_n_s32(q9s32, 14); - d13s16 = vqrshrn_n_s32(q13s32, 14); + d14s16 = vrshrn_n_s32(q2s32, 14); + d15s16 = vrshrn_n_s32(q3s32, 14); + d12s16 = vrshrn_n_s32(q9s32, 14); + d13s16 = vrshrn_n_s32(q13s32, 14); q6s16 = vcombine_s16(d12s16, d13s16); q7s16 = vcombine_s16(d14s16, d15s16); @@ -115,10 +115,10 @@ static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16, d0s16 = vdup_n_s16(cospi_24_64); d1s16 = vdup_n_s16(cospi_8_64); - d18s16 = vqrshrn_n_s32(q2s32, 14); - d19s16 = vqrshrn_n_s32(q3s32, 14); - d22s16 = vqrshrn_n_s32(q13s32, 14); - d23s16 = vqrshrn_n_s32(q15s32, 14); + d18s16 = vrshrn_n_s32(q2s32, 14); + d19s16 = vrshrn_n_s32(q3s32, 14); + d22s16 = vrshrn_n_s32(q13s32, 14); + d23s16 = vrshrn_n_s32(q15s32, 14); *q9s16 = vcombine_s16(d18s16, d19s16); *q11s16 = vcombine_s16(d22s16, d23s16); @@ -132,10 +132,10 @@ static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16, q8s32 = vmlal_s16(q8s32, d28s16, d0s16); q12s32 = vmlal_s16(q12s32, d29s16, d0s16); - d26s16 = vqrshrn_n_s32(q2s32, 14); - d27s16 = vqrshrn_n_s32(q3s32, 14); - d30s16 = vqrshrn_n_s32(q8s32, 14); - d31s16 = vqrshrn_n_s32(q12s32, 14); + d26s16 = vrshrn_n_s32(q2s32, 14); + d27s16 = vrshrn_n_s32(q3s32, 14); + d30s16 = vrshrn_n_s32(q8s32, 14); + d31s16 = vrshrn_n_s32(q12s32, 14); *q13s16 = vcombine_s16(d26s16, d27s16); *q15s16 = vcombine_s16(d30s16, d31s16); @@ -165,10 +165,10 @@ static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16, q11s32 = vmlal_s16(q11s32, d26s16, d16s16); q12s32 = vmlal_s16(q12s32, d27s16, d16s16); - d10s16 = vqrshrn_n_s32(q9s32, 14); - d11s16 = vqrshrn_n_s32(q10s32, 14); - d12s16 = vqrshrn_n_s32(q11s32, 14); - d13s16 = vqrshrn_n_s32(q12s32, 14); + d10s16 = vrshrn_n_s32(q9s32, 14); + d11s16 = vrshrn_n_s32(q10s32, 14); + d12s16 = vrshrn_n_s32(q11s32, 14); + d13s16 = vrshrn_n_s32(q12s32, 14); q5s16 = vcombine_s16(d10s16, d11s16); q6s16 = vcombine_s16(d12s16, d13s16); @@ -242,8 +242,8 @@ static INLINE void IADST8X8_1D(int16x8_t *q8s16, int16x8_t *q9s16, q1s32 = vsubq_s32(q1s32, q5s32); q2s32 = vsubq_s32(q2s32, q6s32); - d22s16 = vqrshrn_n_s32(q11s32, 14); - d23s16 = vqrshrn_n_s32(q12s32, 14); + d22s16 = vrshrn_n_s32(q11s32, 14); + d23s16 = vrshrn_n_s32(q12s32, 14); *q11s16 = vcombine_s16(d22s16, d23s16); q12s32 = vaddq_s32(q3s32, q7s32); @@ -251,12 +251,12 @@ static INLINE void IADST8X8_1D(int16x8_t *q8s16, int16x8_t *q9s16, q3s32 = vsubq_s32(q3s32, q7s32); q4s32 = vsubq_s32(q4s32, q8s32); - d2s16 = vqrshrn_n_s32(q1s32, 14); - d3s16 = vqrshrn_n_s32(q2s32, 14); - d24s16 = vqrshrn_n_s32(q12s32, 14); - d25s16 = vqrshrn_n_s32(q15s32, 14); - d6s16 = vqrshrn_n_s32(q3s32, 14); - d7s16 = vqrshrn_n_s32(q4s32, 14); + d2s16 = vrshrn_n_s32(q1s32, 14); + d3s16 = vrshrn_n_s32(q2s32, 14); + d24s16 = vrshrn_n_s32(q12s32, 14); + d25s16 = vrshrn_n_s32(q15s32, 14); + d6s16 = vrshrn_n_s32(q3s32, 14); + d7s16 = vrshrn_n_s32(q4s32, 14); *q12s16 = vcombine_s16(d24s16, d25s16); d0s16 = 
vdup_n_s16(cospi_10_64); @@ -291,10 +291,10 @@ static INLINE void IADST8X8_1D(int16x8_t *q8s16, int16x8_t *q9s16, q2s32 = vsubq_s32(q2s32, q10s32); q6s32 = vsubq_s32(q6s32, q9s32); - d28s16 = vqrshrn_n_s32(q14s32, 14); - d29s16 = vqrshrn_n_s32(q15s32, 14); - d4s16 = vqrshrn_n_s32(q2s32, 14); - d5s16 = vqrshrn_n_s32(q6s32, 14); + d28s16 = vrshrn_n_s32(q14s32, 14); + d29s16 = vrshrn_n_s32(q15s32, 14); + d4s16 = vrshrn_n_s32(q2s32, 14); + d5s16 = vrshrn_n_s32(q6s32, 14); *q14s16 = vcombine_s16(d28s16, d29s16); q9s32 = vaddq_s32(q4s32, q0s32); @@ -305,10 +305,10 @@ static INLINE void IADST8X8_1D(int16x8_t *q8s16, int16x8_t *q9s16, d30s16 = vdup_n_s16(cospi_8_64); d31s16 = vdup_n_s16(cospi_24_64); - d18s16 = vqrshrn_n_s32(q9s32, 14); - d19s16 = vqrshrn_n_s32(q10s32, 14); - d8s16 = vqrshrn_n_s32(q4s32, 14); - d9s16 = vqrshrn_n_s32(q5s32, 14); + d18s16 = vrshrn_n_s32(q9s32, 14); + d19s16 = vrshrn_n_s32(q10s32, 14); + d8s16 = vrshrn_n_s32(q4s32, 14); + d9s16 = vrshrn_n_s32(q5s32, 14); *q9s16 = vcombine_s16(d18s16, d19s16); q5s32 = vmull_s16(d2s16, d30s16); @@ -341,10 +341,10 @@ static INLINE void IADST8X8_1D(int16x8_t *q8s16, int16x8_t *q9s16, q5s32 = vsubq_s32(q5s32, q1s32); q6s32 = vsubq_s32(q6s32, q3s32); - d18s16 = vqrshrn_n_s32(q14s32, 14); - d19s16 = vqrshrn_n_s32(q15s32, 14); - d10s16 = vqrshrn_n_s32(q5s32, 14); - d11s16 = vqrshrn_n_s32(q6s32, 14); + d18s16 = vrshrn_n_s32(q14s32, 14); + d19s16 = vrshrn_n_s32(q15s32, 14); + d10s16 = vrshrn_n_s32(q5s32, 14); + d11s16 = vrshrn_n_s32(q6s32, 14); *q9s16 = vcombine_s16(d18s16, d19s16); q1s32 = vaddq_s32(q7s32, q10s32); @@ -352,10 +352,10 @@ static INLINE void IADST8X8_1D(int16x8_t *q8s16, int16x8_t *q9s16, q7s32 = vsubq_s32(q7s32, q10s32); q0s32 = vsubq_s32(q0s32, q2s32); - d28s16 = vqrshrn_n_s32(q1s32, 14); - d29s16 = vqrshrn_n_s32(q3s32, 14); - d14s16 = vqrshrn_n_s32(q7s32, 14); - d15s16 = vqrshrn_n_s32(q0s32, 14); + d28s16 = vrshrn_n_s32(q1s32, 14); + d29s16 = vrshrn_n_s32(q3s32, 14); + d14s16 = vrshrn_n_s32(q7s32, 14); + d15s16 = vrshrn_n_s32(q0s32, 14); *q14s16 = vcombine_s16(d28s16, d29s16); d30s16 = vdup_n_s16(cospi_16_64); @@ -374,10 +374,10 @@ static INLINE void IADST8X8_1D(int16x8_t *q8s16, int16x8_t *q9s16, q13s32 = vmlsl_s16(q13s32, d24s16, d30s16); q1s32 = vmlsl_s16(q1s32, d25s16, d30s16); - d4s16 = vqrshrn_n_s32(q2s32, 14); - d5s16 = vqrshrn_n_s32(q3s32, 14); - d24s16 = vqrshrn_n_s32(q13s32, 14); - d25s16 = vqrshrn_n_s32(q1s32, 14); + d4s16 = vrshrn_n_s32(q2s32, 14); + d5s16 = vrshrn_n_s32(q3s32, 14); + d24s16 = vrshrn_n_s32(q13s32, 14); + d25s16 = vrshrn_n_s32(q1s32, 14); q2s16 = vcombine_s16(d4s16, d5s16); *q12s16 = vcombine_s16(d24s16, d25s16); @@ -391,10 +391,10 @@ static INLINE void IADST8X8_1D(int16x8_t *q8s16, int16x8_t *q9s16, q11s32 = vmlsl_s16(q11s32, d14s16, d30s16); q0s32 = vmlsl_s16(q0s32, d15s16, d30s16); - d20s16 = vqrshrn_n_s32(q13s32, 14); - d21s16 = vqrshrn_n_s32(q1s32, 14); - d12s16 = vqrshrn_n_s32(q11s32, 14); - d13s16 = vqrshrn_n_s32(q0s32, 14); + d20s16 = vrshrn_n_s32(q13s32, 14); + d21s16 = vrshrn_n_s32(q1s32, 14); + d12s16 = vrshrn_n_s32(q11s32, 14); + d13s16 = vrshrn_n_s32(q0s32, 14); *q10s16 = vcombine_s16(d20s16, d21s16); q6s16 = vcombine_s16(d12s16, d13s16); @@ -406,8 +406,8 @@ static INLINE void IADST8X8_1D(int16x8_t *q8s16, int16x8_t *q9s16, *q15s16 = vsubq_s16(q5s16, q4s16); } -void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, - int dest_stride, int tx_type) { +void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride, + int tx_type) { int i; uint8_t *d1, *d2; 
uint8x8_t d0u8, d1u8, d2u8, d3u8; @@ -429,7 +429,7 @@ void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, switch (tx_type) { case 0: // idct_idct is not supported. Fall back to C - vp9_iht8x8_64_add_c(input, dest, dest_stride, tx_type); + vp9_iht8x8_64_add_c(input, dest, stride, tx_type); return; case 1: // iadst_idct // generate IDCT constants @@ -508,13 +508,13 @@ void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, } d0u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; + d1 += stride; d1u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; + d1 += stride; d2u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; + d1 += stride; d3u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; + d1 += stride; q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64)); q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64)); @@ -529,12 +529,12 @@ void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); - d2 += dest_stride; + d2 += stride; vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); - d2 += dest_stride; + d2 += stride; vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - d2 += dest_stride; + d2 += stride; vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); - d2 += dest_stride; + d2 += stride; } } diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c index 2d4839174db..f6b29265e66 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c @@ -21,8 +21,8 @@ #include "vpx_ports/mem.h" #if HAVE_DSPR2 -void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride, int tx_type) { +void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, int stride, + int tx_type) { int i, j; DECLARE_ALIGNED(32, int16_t, out[4 * 4]); int16_t *outptr = out; @@ -37,7 +37,7 @@ void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, switch (tx_type) { case DCT_DCT: // DCT in both horizontal and vertical vpx_idct4_rows_dspr2(input, outptr); - vpx_idct4_columns_add_blk_dspr2(&out[0], dest, dest_stride); + vpx_idct4_columns_add_blk_dspr2(&out[0], dest, stride); break; case ADST_DCT: // ADST in vertical, DCT in horizontal vpx_idct4_rows_dspr2(input, outptr); @@ -48,8 +48,8 @@ void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, iadst4_dspr2(outptr, temp_out); for (j = 0; j < 4; ++j) - dest[j * dest_stride + i] = clip_pixel( - ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i]); + dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4) + + dest[j * stride + i]); outptr += 4; } @@ -66,7 +66,7 @@ void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, temp_in[i * 4 + j] = out[j * 4 + i]; } } - vpx_idct4_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride); + vpx_idct4_columns_add_blk_dspr2(&temp_in[0], dest, stride); break; case ADST_ADST: // ADST in both directions for (i = 0; i < 4; ++i) { @@ -80,8 +80,8 @@ void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, iadst4_dspr2(temp_in, temp_out); for (j = 0; j < 4; ++j) - dest[j * dest_stride + i] = clip_pixel( - ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i]); + dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4) 
+ + dest[j * stride + i]); } break; default: printf("vp9_short_iht4x4_add_dspr2 : Invalid tx_type\n"); break; diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c index 86896f04ca5..b945e307e63 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c @@ -20,8 +20,8 @@ #include "vpx_ports/mem.h" #if HAVE_DSPR2 -void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride, int tx_type) { +void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, int stride, + int tx_type) { int i, j; DECLARE_ALIGNED(32, int16_t, out[8 * 8]); int16_t *outptr = out; @@ -34,7 +34,7 @@ void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, switch (tx_type) { case DCT_DCT: // DCT in both horizontal and vertical idct8_rows_dspr2(input, outptr, 8); - idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride); + idct8_columns_add_blk_dspr2(&out[0], dest, stride); break; case ADST_DCT: // ADST in vertical, DCT in horizontal idct8_rows_dspr2(input, outptr, 8); @@ -43,8 +43,8 @@ void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, iadst8_dspr2(&out[i * 8], temp_out); for (j = 0; j < 8; ++j) - dest[j * dest_stride + i] = clip_pixel( - ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]); + dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) + + dest[j * stride + i]); } break; case DCT_ADST: // DCT in vertical, ADST in horizontal @@ -59,7 +59,7 @@ void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, temp_in[i * 8 + j] = out[j * 8 + i]; } } - idct8_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride); + idct8_columns_add_blk_dspr2(&temp_in[0], dest, stride); break; case ADST_ADST: // ADST in both directions for (i = 0; i < 8; ++i) { @@ -74,8 +74,8 @@ void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, iadst8_dspr2(temp_in, temp_out); for (j = 0; j < 8; ++j) - dest[j * dest_stride + i] = clip_pixel( - ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]); + dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) + + dest[j * stride + i]); } break; default: printf("vp9_short_iht8x8_add_dspr2 : Invalid tx_type\n"); break; diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_ppflags.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_ppflags.h index 6dcfa412bee..b8b647bf18d 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_ppflags.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_ppflags.h @@ -20,14 +20,7 @@ enum { VP9D_DEBLOCK = 1 << 0, VP9D_DEMACROBLOCK = 1 << 1, VP9D_ADDNOISE = 1 << 2, - VP9D_DEBUG_TXT_FRAME_INFO = 1 << 3, - VP9D_DEBUG_TXT_MBLK_MODES = 1 << 4, - VP9D_DEBUG_TXT_DC_DIFF = 1 << 5, - VP9D_DEBUG_TXT_RATE_INFO = 1 << 6, - VP9D_DEBUG_DRAW_MV = 1 << 7, - VP9D_DEBUG_CLR_BLK_MODES = 1 << 8, - VP9D_DEBUG_CLR_FRM_REF_BLKS = 1 << 9, - VP9D_MFQE = 1 << 10 + VP9D_MFQE = 1 << 3 }; typedef struct { diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_rtcd_defs.pl b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_rtcd_defs.pl index abef0676396..088b004f528 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_rtcd_defs.pl +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_rtcd_defs.pl @@ -48,16 +48,16 @@ specialize 
qw/vp9_filter_by_weight8x8 sse2 msa/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # Force C versions if CONFIG_EMULATE_HARDWARE is 1 if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") { - add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; + add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type"; - add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; + add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type"; add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type"; } else { - add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; + add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type"; specialize qw/vp9_iht4x4_16_add sse2/; - add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; + add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type"; specialize qw/vp9_iht8x8_64_add sse2/; add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type"; @@ -66,16 +66,16 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { } else { # Force C versions if CONFIG_EMULATE_HARDWARE is 1 if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") { - add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; + add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type"; - add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; + add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type"; add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type"; } else { - add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; + add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type"; specialize qw/vp9_iht4x4_16_add sse2 neon dspr2 msa/; - add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; + add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type"; specialize qw/vp9_iht8x8_64_add sse2 neon dspr2 msa/; add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type"; @@ -101,9 +101,9 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # # Note as optimized versions of these functions are added we need to add a check to ensure # that when CONFIG_EMULATE_HARDWARE is on, it defaults to the C versions only. 
- add_proto qw/void vp9_highbd_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd"; + add_proto qw/void vp9_highbd_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd"; - add_proto qw/void vp9_highbd_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd"; + add_proto qw/void vp9_highbd_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd"; add_proto qw/void vp9_highbd_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd"; } diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c index 072d92e4e91..3dc88b1914e 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c @@ -128,16 +128,20 @@ int vp9_cyclic_refresh_rc_bits_per_mb(const VP9_COMP *cpi, int i, CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; int bits_per_mb; int num8x8bl = cm->MBs << 2; + // Compute delta-q corresponding to qindex i. + int deltaq = compute_deltaq(cpi, i, cr->rate_ratio_qdelta); // Weight for segment prior to encoding: take the average of the target // number for the frame to be encoded and the actual from the previous frame. + // Use the target if its less. int target_refresh = cr->percent_refresh * cm->mi_rows * cm->mi_cols / 100; + double weight_segment_target = (double)(target_refresh) / num8x8bl; double weight_segment = (double)((target_refresh + cr->actual_num_seg1_blocks + cr->actual_num_seg2_blocks) >> 1) / num8x8bl; - // Compute delta-q corresponding to qindex i. - int deltaq = compute_deltaq(cpi, i, cr->rate_ratio_qdelta); + if (weight_segment_target < 7 * weight_segment / 8) + weight_segment = weight_segment_target; // Take segment weighted average for bits per mb. bits_per_mb = (int)((1.0 - weight_segment) * vp9_rc_bits_per_mb(cm->frame_type, i, @@ -383,13 +387,14 @@ static void cyclic_refresh_update_map(VP9_COMP *const cpi) { : vp9_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST1, cm->base_qindex); // More aggressive settings for noisy content. if (cpi->noise_estimate.enabled && cpi->noise_estimate.level >= kMedium) { - consec_zero_mv_thresh = 80; + consec_zero_mv_thresh = 60; qindex_thresh = VPXMAX(vp9_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST1, cm->base_qindex), - 7 * cm->base_qindex >> 3); + cm->base_qindex); } do { int sum_map = 0; + int consec_zero_mv_thresh_block = consec_zero_mv_thresh; // Get the mi_row/mi_col corresponding to superblock index i. 
int sb_row_index = (i / sb_cols); int sb_col_index = i - sb_row_index * sb_cols; @@ -403,6 +408,9 @@ static void cyclic_refresh_update_map(VP9_COMP *const cpi) { VPXMIN(cm->mi_cols - mi_col, num_8x8_blocks_wide_lookup[BLOCK_64X64]); ymis = VPXMIN(cm->mi_rows - mi_row, num_8x8_blocks_high_lookup[BLOCK_64X64]); + if (cpi->noise_estimate.enabled && cpi->noise_estimate.level >= kMedium && + (xmis <= 2 || ymis <= 2)) + consec_zero_mv_thresh_block = 10; for (y = 0; y < ymis; y++) { for (x = 0; x < xmis; x++) { const int bl_index2 = bl_index + y * cm->mi_cols + x; @@ -412,7 +420,7 @@ static void cyclic_refresh_update_map(VP9_COMP *const cpi) { if (cr->map[bl_index2] == 0) { count_tot++; if (cr->last_coded_q_map[bl_index2] > qindex_thresh || - cpi->consec_zero_mv[bl_index2] < consec_zero_mv_thresh) { + cpi->consec_zero_mv[bl_index2] < consec_zero_mv_thresh_block) { sum_map++; count_sel++; } @@ -468,8 +476,8 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) { } // Adjust some parameters for low resolutions at low bitrates. if (cm->width <= 352 && cm->height <= 288 && rc->avg_frame_bandwidth < 3400) { - cr->motion_thresh = 4; - cr->rate_boost_fac = 10; + cr->motion_thresh = 16; + cr->rate_boost_fac = 13; } if (cpi->svc.spatial_layer_id > 0) { cr->motion_thresh = 4; diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeframe.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeframe.c index 3ab05375ff7..323c053edff 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeframe.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeframe.c @@ -477,8 +477,8 @@ static void set_vbp_thresholds(VP9_COMP *cpi, int64_t thresholds[], int q) { thresholds[2] = threshold_base >> 2; thresholds[3] = threshold_base << 2; } else { - // Increase base variance threshold based on estimated noise level. - if (cpi->noise_estimate.enabled) { + // Increase base variance threshold based on estimated noise level. + if (cpi->noise_estimate.enabled && cm->width >= 640 && cm->height >= 480) { NOISE_LEVEL noise_level = vp9_noise_estimate_extract_level(&cpi->noise_estimate); if (noise_level == kHigh) @@ -526,6 +526,7 @@ void vp9_set_variance_partition_thresholds(VP9_COMP *cpi, int q) { : 1000; cpi->vbp_bsize_min = BLOCK_16X16; } + cpi->vbp_threshold_copy = cpi->vbp_thresholds[0] << 16; cpi->vbp_threshold_minmax = 15 + (q >> 3); } } @@ -742,9 +743,13 @@ static void set_low_temp_var_flag(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, continue; if ((*this_mi)->sb_type == BLOCK_32X32) { - if (vt->split[i].part_variances.none.variance < (thresholds[1] >> 1)) + int64_t threshold_32x32 = (cpi->sf.short_circuit_low_temp_var == 1 || + cpi->sf.short_circuit_low_temp_var == 3) + ? ((5 * thresholds[1]) >> 3) + : (thresholds[1] >> 1); + if (vt->split[i].part_variances.none.variance < threshold_32x32) x->variance_low[i + 5] = 1; - } else if (cpi->sf.short_circuit_low_temp_var == 2) { + } else if (cpi->sf.short_circuit_low_temp_var >= 2) { // For 32x16 and 16x32 blocks, the flag is set on each 16x16 block // inside. 
if ((*this_mi)->sb_type == BLOCK_16X16 || @@ -762,6 +767,93 @@ static void set_low_temp_var_flag(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, } } +static void copy_prev_partition(VP9_COMP *cpi, BLOCK_SIZE bsize, int mi_row, + int mi_col) { + VP9_COMMON *const cm = &cpi->common; + BLOCK_SIZE *prev_part = cpi->prev_partition; + int start_pos = mi_row * cm->mi_stride + mi_col; + + const int bsl = b_width_log2_lookup[bsize]; + const int bs = (1 << bsl) / 4; + BLOCK_SIZE subsize; + PARTITION_TYPE partition; + MODE_INFO *mi = NULL; + + if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; + + partition = partition_lookup[bsl][prev_part[start_pos]]; + subsize = get_subsize(bsize, partition); + mi = cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col]; + + if (subsize < BLOCK_8X8) { + mi->sb_type = bsize; + } else { + switch (partition) { + case PARTITION_NONE: mi->sb_type = bsize; break; + case PARTITION_HORZ: + mi->sb_type = subsize; + if (mi_row + bs < cm->mi_rows) + cm->mi_grid_visible[(mi_row + bs) * cm->mi_stride + mi_col]->sb_type = + subsize; + break; + case PARTITION_VERT: + mi->sb_type = subsize; + if (mi_col + bs < cm->mi_cols) + cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col + bs]->sb_type = + subsize; + break; + case PARTITION_SPLIT: + copy_prev_partition(cpi, subsize, mi_row, mi_col); + copy_prev_partition(cpi, subsize, mi_row + bs, mi_col); + copy_prev_partition(cpi, subsize, mi_row, mi_col + bs); + copy_prev_partition(cpi, subsize, mi_row + bs, mi_col + bs); + break; + default: assert(0); + } + } +} + +static void update_prev_partition(VP9_COMP *cpi, BLOCK_SIZE bsize, int mi_row, + int mi_col) { + VP9_COMMON *const cm = &cpi->common; + BLOCK_SIZE *prev_part = cpi->prev_partition; + int start_pos = mi_row * cm->mi_stride + mi_col; + const int bsl = b_width_log2_lookup[bsize]; + const int bs = (1 << bsl) / 4; + BLOCK_SIZE subsize; + PARTITION_TYPE partition; + const MODE_INFO *mi = NULL; + + if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; + + mi = cm->mi_grid_visible[start_pos]; + partition = partition_lookup[bsl][mi->sb_type]; + subsize = get_subsize(bsize, partition); + if (subsize < BLOCK_8X8) { + prev_part[start_pos] = bsize; + } else { + switch (partition) { + case PARTITION_NONE: prev_part[start_pos] = bsize; break; + case PARTITION_HORZ: + prev_part[start_pos] = subsize; + if (mi_row + bs < cm->mi_rows) + prev_part[start_pos + bs * cm->mi_stride] = subsize; + break; + case PARTITION_VERT: + prev_part[start_pos] = subsize; + if (mi_col + bs < cm->mi_cols) prev_part[start_pos + bs] = subsize; + break; + case PARTITION_SPLIT: + update_prev_partition(cpi, subsize, mi_row, mi_col); + update_prev_partition(cpi, subsize, mi_row + bs, mi_col); + update_prev_partition(cpi, subsize, mi_row, mi_col + bs); + update_prev_partition(cpi, subsize, mi_row + bs, mi_col + bs); + break; + default: assert(0); + } + } +} + static void chroma_check(VP9_COMP *cpi, MACROBLOCK *x, int bsize, unsigned int y_sad, int is_key_frame) { int i; @@ -824,6 +916,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, const int low_res = (cm->width <= 352 && cm->height <= 288); int variance4x4downsample[16]; int segment_id; + int offset = cm->mi_stride * mi_row + mi_col; set_offsets(cpi, tile, x, mi_row, mi_col, BLOCK_64X64); segment_id = xd->mi[0]->segment_id; @@ -834,8 +927,8 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, } } - threshold_4x4avg = - (cpi->oxcf.speed < 8) ? 
thresholds[1] << 1 : thresholds[2] >> 1; + // For non keyframes, disable 4x4 average for low resolution when speed = 8 + threshold_4x4avg = (cpi->oxcf.speed < 8) ? thresholds[1] << 1 : INT64_MAX; memset(x->variance_low, 0, sizeof(x->variance_low)); @@ -857,7 +950,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, LAST_FRAME); const YV12_BUFFER_CONFIG *yv12_g = NULL; - unsigned int y_sad_g, y_sad_thr; + unsigned int y_sad_g, y_sad_thr, y_sad_last; bsize = BLOCK_32X32 + (mi_col + 4 < cm->mi_cols) * 2 + (mi_row + 4 < cm->mi_rows); @@ -897,6 +990,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, mi->interp_filter = BILINEAR; y_sad = vp9_int_pro_motion_estimation(cpi, x, bsize, mi_row, mi_col); + y_sad_last = y_sad; // Pick ref frame for partitioning, bias last frame when y_sad_g and y_sad // are close if short_circuit_low_temp_var is on. y_sad_thr = cpi->sf.short_circuit_low_temp_var ? (y_sad * 7) >> 3 : y_sad; @@ -937,6 +1031,20 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, return 0; } } + + // If the y_sad is small enough, copy the partition of the superblock in the + // last frame to current frame only if the last frame is not a keyframe. + // TODO(jianj) : tune the threshold. + if (cpi->sf.copy_partition_flag && cpi->rc.frames_since_key > 1 && + segment_id == CR_SEGMENT_ID_BASE && + cpi->prev_segment_id[offset] == CR_SEGMENT_ID_BASE && + y_sad_last < cpi->vbp_threshold_copy) { + if (cpi->prev_partition != NULL) { + copy_prev_partition(cpi, BLOCK_64X64, mi_row, mi_col); + chroma_check(cpi, x, bsize, y_sad, is_key_frame); + return 0; + } + } } else { d = VP9_VAR_OFFS; dp = 0; @@ -1131,6 +1239,11 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, } } + if (cm->frame_type != KEY_FRAME && cpi->sf.copy_partition_flag) { + update_prev_partition(cpi, BLOCK_64X64, mi_row, mi_col); + cpi->prev_segment_id[offset] = segment_id; + } + if (cpi->sf.short_circuit_low_temp_var) { set_low_temp_var_flag(cpi, x, xd, &vt, thresholds, ref_frame_partition, mi_col, mi_row); diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemb.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemb.c index 20ebe68197e..2cb137d8b93 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemb.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemb.c @@ -109,6 +109,8 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, int64_t error0, error1; int16_t t0, t1; EXTRABIT e0; + unsigned int(*const token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] = + mb->token_costs[tx_size][type][ref]; int best, band, pt, i, final_eob; #if CONFIG_VP9_HIGHBITDEPTH const int *cat6_high_cost = vp9_get_high_cost_table(xd->bd); @@ -137,7 +139,6 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, int x = qcoeff[rc]; /* Only add a trellis state for non-zero coefficients. */ if (x) { - int shortcut = 0; error0 = tokens[next][0].error; error1 = tokens[next][1].error; /* Evaluate the first possibility for this state. 
*/ @@ -148,10 +149,8 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, if (next < default_eob) { band = band_translate[i + 1]; pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache); - rate0 += mb->token_costs[tx_size][type][ref][band][0][pt] - [tokens[next][0].token]; - rate1 += mb->token_costs[tx_size][type][ref][band][0][pt] - [tokens[next][1].token]; + rate0 += token_costs[band][0][pt][tokens[next][0].token]; + rate1 += token_costs[band][0][pt][tokens[next][1].token]; } UPDATE_RD_COST(); /* And pick the best. */ @@ -178,12 +177,7 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, if ((abs(x) * dequant_ptr[rc != 0] > (abs(coeff[rc]) << shift)) && (abs(x) * dequant_ptr[rc != 0] < - (abs(coeff[rc]) << shift) + dequant_ptr[rc != 0])) - shortcut = 1; - else - shortcut = 0; - - if (shortcut) { + (abs(coeff[rc]) << shift) + dequant_ptr[rc != 0])) { sz = -(x < 0); x -= 2 * sz + 1; } else { @@ -208,13 +202,11 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, band = band_translate[i + 1]; if (t0 != EOB_TOKEN) { pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache); - rate0 += mb->token_costs[tx_size][type][ref][band][!x][pt] - [tokens[next][0].token]; + rate0 += token_costs[band][!x][pt][tokens[next][0].token]; } if (t1 != EOB_TOKEN) { pt = trellis_get_coeff_context(scan, nb, i, t1, token_cache); - rate1 += mb->token_costs[tx_size][type][ref][band][!x][pt] - [tokens[next][1].token]; + rate1 += token_costs[band][!x][pt][tokens[next][1].token]; } } @@ -223,18 +215,17 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, best = rd_cost1 < rd_cost0; base_bits = vp9_get_cost(t0, e0, cat6_high_cost); - if (shortcut) { #if CONFIG_VP9_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - dx -= ((dequant_ptr[rc != 0] >> (xd->bd - 8)) + sz) ^ sz; - } else { - dx -= (dequant_ptr[rc != 0] + sz) ^ sz; - } -#else + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + dx -= ((dequant_ptr[rc != 0] >> (xd->bd - 8)) + sz) ^ sz; + } else { dx -= (dequant_ptr[rc != 0] + sz) ^ sz; -#endif // CONFIG_VP9_HIGHBITDEPTH - d2 = dx * dx; } +#else + dx -= (dequant_ptr[rc != 0] + sz) ^ sz; +#endif // CONFIG_VP9_HIGHBITDEPTH + d2 = dx * dx; + tokens[i][1].rate = base_bits + (best ? rate1 : rate0); tokens[i][1].error = d2 + (best ? error1 : error0); tokens[i][1].next = next; @@ -270,13 +261,11 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, t1 = tokens[next][1].token; /* Update the cost of each path if we're past the EOB token. 
*/ if (t0 != EOB_TOKEN) { - tokens[next][0].rate += - mb->token_costs[tx_size][type][ref][band][1][pt][t0]; + tokens[next][0].rate += token_costs[band][1][pt][t0]; tokens[next][0].token = ZERO_TOKEN; } if (t1 != EOB_TOKEN) { - tokens[next][1].rate += - mb->token_costs[tx_size][type][ref][band][1][pt][t1]; + tokens[next][1].rate += token_costs[band][1][pt][t1]; tokens[next][1].token = ZERO_TOKEN; } tokens[i][0].best_index = tokens[i][1].best_index = 0; @@ -292,8 +281,8 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, error1 = tokens[next][1].error; t0 = tokens[next][0].token; t1 = tokens[next][1].token; - rate0 += mb->token_costs[tx_size][type][ref][band][0][ctx][t0]; - rate1 += mb->token_costs[tx_size][type][ref][band][0][ctx][t1]; + rate0 += token_costs[band][0][ctx][t0]; + rate1 += token_costs[band][0][ctx][t1]; UPDATE_RD_COST(); best = rd_cost1 < rd_cost0; final_eob = -1; diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.c index 2a58003829c..432eac8da00 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.c @@ -108,7 +108,7 @@ static int is_psnr_calc_enabled(VP9_COMP *cpi) { } /* clang-format off */ -static const Vp9LevelSpec vp9_level_defs[VP9_LEVELS] = { +const Vp9LevelSpec vp9_level_defs[VP9_LEVELS] = { { LEVEL_1, 829440, 36864, 200, 400, 2, 1, 4, 8 }, { LEVEL_1_1, 2764800, 73728, 800, 1000, 2, 1, 4, 8 }, { LEVEL_2, 4608000, 122880, 1800, 1500, 2, 1, 4, 8 }, @@ -128,6 +128,16 @@ static const Vp9LevelSpec vp9_level_defs[VP9_LEVELS] = { }; /* clang-format on */ +static const char *level_fail_messages[TARGET_LEVEL_FAIL_IDS] = + { "The average bit-rate is too high.", + "The picture size is too large.", + "The luma sample rate is too large.", + "The CPB size is too large.", + "The compression ratio is too small", + "Too many column tiles are used.", + "The alt-ref distance is too small.", + "Too many reference buffers are used." 
}; + static INLINE void Scale2Ratio(VPX_SCALING mode, int *hr, int *hs) { switch (mode) { case NORMAL: @@ -224,8 +234,9 @@ VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec) { for (i = 0; i < VP9_LEVELS; ++i) { this_level = &vp9_level_defs[i]; - if ((double)level_spec->max_luma_sample_rate * (1 + SAMPLE_RATE_GRACE_P) > - (double)this_level->max_luma_sample_rate || + if ((double)level_spec->max_luma_sample_rate > + (double)this_level->max_luma_sample_rate * + (1 + SAMPLE_RATE_GRACE_P) || level_spec->max_luma_picture_size > this_level->max_luma_picture_size || level_spec->average_bitrate > this_level->average_bitrate || level_spec->max_cpb_size > this_level->max_cpb_size || @@ -439,6 +450,12 @@ static void dealloc_compressor_data(VP9_COMP *cpi) { cpi->nmvsadcosts_hp[0] = NULL; cpi->nmvsadcosts_hp[1] = NULL; + vpx_free(cpi->prev_partition); + cpi->prev_partition = NULL; + + vpx_free(cpi->prev_segment_id); + cpi->prev_segment_id = NULL; + vp9_cyclic_refresh_free(cpi->cyclic_refresh); cpi->cyclic_refresh = NULL; @@ -872,6 +889,22 @@ static void init_buffer_indices(VP9_COMP *cpi) { cpi->alt_fb_idx = 2; } +static void init_level_constraint(LevelConstraint *lc) { + lc->level_index = -1; + lc->max_cpb_size = INT_MAX; + lc->max_frame_size = INT_MAX; + lc->rc_config_updated = 0; + lc->fail_flag = 0; +} + +static void set_level_constraint(LevelConstraint *ls, int8_t level_index) { + vpx_clear_system_state(); + ls->level_index = level_index; + if (level_index >= 0) { + ls->max_cpb_size = vp9_level_defs[level_index].max_cpb_size * (double)1000; + } +} + static void init_config(struct VP9_COMP *cpi, VP9EncoderConfig *oxcf) { VP9_COMMON *const cm = &cpi->common; @@ -887,6 +920,8 @@ static void init_config(struct VP9_COMP *cpi, VP9EncoderConfig *oxcf) { cpi->target_level = oxcf->target_level; cpi->keep_level_stats = oxcf->target_level != LEVEL_MAX; + set_level_constraint(&cpi->level_constraint, + get_level_index(cpi->target_level)); cm->width = oxcf->width; cm->height = oxcf->height; @@ -1403,6 +1438,8 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { cpi->target_level = oxcf->target_level; cpi->keep_level_stats = oxcf->target_level != LEVEL_MAX; + set_level_constraint(&cpi->level_constraint, + get_level_index(cpi->target_level)); if (cm->profile <= PROFILE_1) assert(cm->bit_depth == VPX_BITS_8); @@ -1679,6 +1716,7 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf, cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS; init_level_info(&cpi->level_info); + init_level_constraint(&cpi->level_constraint); #if CONFIG_INTERNAL_STATS cpi->b_calculate_blockiness = 1; @@ -3127,7 +3165,7 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, if (cpi->oxcf.pass == 0 && cpi->oxcf.mode == REALTIME && cpi->oxcf.speed >= 5 && cpi->resize_state == 0 && (cpi->oxcf.content == VP9E_CONTENT_SCREEN || - cpi->oxcf.rc_mode == VPX_VBR) && + cpi->oxcf.rc_mode == VPX_VBR || cpi->sf.copy_partition_flag) && cm->show_frame) vp9_avg_source_sad(cpi); @@ -3238,9 +3276,14 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, int frame_over_shoot_limit; int frame_under_shoot_limit; int q = 0, q_low = 0, q_high = 0; + int enable_acl; set_size_independent_vars(cpi); + enable_acl = cpi->sf.allow_acl + ? 
(cm->frame_type == KEY_FRAME) || (cm->show_frame == 0) + : 0; + do { vpx_clear_system_state(); @@ -3335,7 +3378,6 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, if (!cpi->sf.use_nonrd_pick_mode) vp9_pack_bitstream(cpi, dest, size); rc->projected_frame_size = (int)(*size) << 3; - restore_coding_context(cpi); if (frame_over_shoot_limit == 0) frame_over_shoot_limit = 1; } @@ -3505,7 +3547,22 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, ++cpi->tot_recode_hits; #endif } + + if (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF) + if (loop || !enable_acl) restore_coding_context(cpi); } while (loop); + + if (enable_acl) { + vp9_encode_frame(cpi); + vpx_clear_system_state(); + restore_coding_context(cpi); + vp9_pack_bitstream(cpi, dest, size); + + vp9_encode_frame(cpi); + vpx_clear_system_state(); + + restore_coding_context(cpi); + } } static int get_ref_frame_flags(const VP9_COMP *cpi) { @@ -4288,6 +4345,26 @@ static void adjust_image_stat(double y, double u, double v, double all, } #endif // CONFIG_INTERNAL_STATS +// Adjust the maximum allowable frame size for the target level. +static void level_rc_framerate(VP9_COMP *cpi, int arf_src_index) { + RATE_CONTROL *const rc = &cpi->rc; + LevelConstraint *const ls = &cpi->level_constraint; + VP9_COMMON *const cm = &cpi->common; + const double max_cpb_size = ls->max_cpb_size; + vpx_clear_system_state(); + rc->max_frame_bandwidth = VPXMIN(rc->max_frame_bandwidth, ls->max_frame_size); + if (frame_is_intra_only(cm)) { + rc->max_frame_bandwidth = + VPXMIN(rc->max_frame_bandwidth, (int)(max_cpb_size * 0.5)); + } else if (arf_src_index > 0) { + rc->max_frame_bandwidth = + VPXMIN(rc->max_frame_bandwidth, (int)(max_cpb_size * 0.4)); + } else { + rc->max_frame_bandwidth = + VPXMIN(rc->max_frame_bandwidth, (int)(max_cpb_size * 0.2)); + } +} + static void update_level_info(VP9_COMP *cpi, size_t *size, int arf_src_index) { VP9_COMMON *const cm = &cpi->common; Vp9LevelInfo *const level_info = &cpi->level_info; @@ -4296,6 +4373,8 @@ static void update_level_info(VP9_COMP *cpi, size_t *size, int arf_src_index) { int i, idx; uint64_t luma_samples, dur_end; const uint32_t luma_pic_size = cm->width * cm->height; + LevelConstraint *const level_constraint = &cpi->level_constraint; + const int8_t level_index = level_constraint->level_index; double cpb_data_size; vpx_clear_system_state(); @@ -4406,6 +4485,78 @@ static void update_level_info(VP9_COMP *cpi, size_t *size, int arf_src_index) { if (level_spec->max_col_tiles < (1 << cm->log2_tile_cols)) { level_spec->max_col_tiles = (1 << cm->log2_tile_cols); } + + if (level_index >= 0 && level_constraint->fail_flag == 0) { + if (level_spec->max_luma_picture_size > + vp9_level_defs[level_index].max_luma_picture_size) { + level_constraint->fail_flag |= (1 << LUMA_PIC_SIZE_TOO_LARGE); + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Failed to encode to the target level %d. %s", + vp9_level_defs[level_index].level, + level_fail_messages[LUMA_PIC_SIZE_TOO_LARGE]); + } + + if ((double)level_spec->max_luma_sample_rate > + (double)vp9_level_defs[level_index].max_luma_sample_rate * + (1 + SAMPLE_RATE_GRACE_P)) { + level_constraint->fail_flag |= (1 << LUMA_SAMPLE_RATE_TOO_LARGE); + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Failed to encode to the target level %d. 
%s", + vp9_level_defs[level_index].level, + level_fail_messages[LUMA_SAMPLE_RATE_TOO_LARGE]); + } + + if (level_spec->max_col_tiles > vp9_level_defs[level_index].max_col_tiles) { + level_constraint->fail_flag |= (1 << TOO_MANY_COLUMN_TILE); + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Failed to encode to the target level %d. %s", + vp9_level_defs[level_index].level, + level_fail_messages[TOO_MANY_COLUMN_TILE]); + } + + if (level_spec->min_altref_distance < + vp9_level_defs[level_index].min_altref_distance) { + level_constraint->fail_flag |= (1 << ALTREF_DIST_TOO_SMALL); + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Failed to encode to the target level %d. %s", + vp9_level_defs[level_index].level, + level_fail_messages[ALTREF_DIST_TOO_SMALL]); + } + + if (level_spec->max_ref_frame_buffers > + vp9_level_defs[level_index].max_ref_frame_buffers) { + level_constraint->fail_flag |= (1 << TOO_MANY_REF_BUFFER); + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Failed to encode to the target level %d. %s", + vp9_level_defs[level_index].level, + level_fail_messages[TOO_MANY_REF_BUFFER]); + } + + if (level_spec->max_cpb_size > vp9_level_defs[level_index].max_cpb_size) { + level_constraint->fail_flag |= (1 << CPB_TOO_LARGE); + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Failed to encode to the target level %d. %s", + vp9_level_defs[level_index].level, + level_fail_messages[CPB_TOO_LARGE]); + } + + // Set an upper bound for the next frame size. It will be used in + // level_rc_framerate() before encoding the next frame. + cpb_data_size = 0; + for (i = 0; i < CPB_WINDOW_SIZE - 1; ++i) { + if (i >= level_stats->frame_window_buffer.len) break; + idx = (level_stats->frame_window_buffer.start + + level_stats->frame_window_buffer.len - 1 - i) % + FRAME_WINDOW_SIZE; + cpb_data_size += level_stats->frame_window_buffer.buf[idx].size; + } + cpb_data_size = cpb_data_size / 125.0; + level_constraint->max_frame_size = + (int)((vp9_level_defs[level_index].max_cpb_size - cpb_data_size) * + 1000.0); + if (level_stats->frame_window_buffer.len < CPB_WINDOW_SIZE - 1) + level_constraint->max_frame_size >>= 1; + } } int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, @@ -4633,6 +4784,10 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, set_frame_size(cpi); } + if (oxcf->pass != 1 && cpi->level_constraint.level_index >= 0 && + cpi->level_constraint.fail_flag == 0) + level_rc_framerate(cpi, arf_src_index); + if (cpi->oxcf.pass != 0 || cpi->use_svc || frame_is_intra_only(cm) == 1) { for (i = 0; i < MAX_REF_FRAMES; ++i) cpi->scaled_ref_idx[i] = INVALID_IDX; } diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.h index 0007e6395da..de324d3aab9 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.h @@ -237,7 +237,7 @@ typedef struct VP9EncoderConfig { int max_threads; - int target_level; + unsigned int target_level; vpx_fixed_buf_t two_pass_stats_in; struct vpx_codec_pkt_list *output_pkt_list; @@ -341,6 +341,8 @@ typedef struct { uint8_t max_ref_frame_buffers; } Vp9LevelSpec; +extern const Vp9LevelSpec vp9_level_defs[VP9_LEVELS]; + typedef struct { int64_t ts; // timestamp uint32_t luma_samples; @@ -368,6 +370,26 @@ typedef struct { Vp9LevelSpec level_spec; } Vp9LevelInfo; +typedef enum { + BITRATE_TOO_LARGE = 0, + LUMA_PIC_SIZE_TOO_LARGE = 1, + 
LUMA_SAMPLE_RATE_TOO_LARGE = 2, + CPB_TOO_LARGE = 3, + COMPRESSION_RATIO_TOO_SMALL = 4, + TOO_MANY_COLUMN_TILE = 5, + ALTREF_DIST_TOO_SMALL = 6, + TOO_MANY_REF_BUFFER = 7, + TARGET_LEVEL_FAIL_IDS = 8 +} TARGET_LEVEL_FAIL_ID; + +typedef struct { + int8_t level_index; + uint8_t rc_config_updated; + uint8_t fail_flag; + int max_frame_size; // in bits + double max_cpb_size; // in bits +} LevelConstraint; + typedef struct VP9_COMP { QUANTS quants; ThreadData td; @@ -594,6 +616,8 @@ typedef struct VP9_COMP { int64_t vbp_thresholds[4]; int64_t vbp_threshold_minmax; int64_t vbp_threshold_sad; + // Threshold used for partition copy + int64_t vbp_threshold_copy; BLOCK_SIZE vbp_bsize_min; // Multi-threading @@ -605,6 +629,12 @@ typedef struct VP9_COMP { int keep_level_stats; Vp9LevelInfo level_info; + + // Previous Partition Info + BLOCK_SIZE *prev_partition; + int8_t *prev_segment_id; + + LevelConstraint level_constraint; } VP9_COMP; void vp9_initialize_enc(void); @@ -760,6 +790,14 @@ static INLINE int *cond_cost_list(const struct VP9_COMP *cpi, int *cost_list) { return cpi->sf.mv.subpel_search_method != SUBPEL_TREE ? cost_list : NULL; } +static INLINE int get_level_index(VP9_LEVEL level) { + int i; + for (i = 0; i < VP9_LEVELS; ++i) { + if (level == vp9_level_defs[i].level) return i; + } + return -1; +} + VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec); void vp9_new_framerate(VP9_COMP *cpi, double framerate); diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.c index 788952d3467..72e9ac77e78 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.c @@ -117,8 +117,7 @@ static void output_stats(FIRSTPASS_STATS *stats, stats->intra_skip_pct, stats->intra_smooth_pct, stats->inactive_zone_rows, stats->inactive_zone_cols, stats->MVr, stats->mvr_abs, stats->MVc, stats->mvc_abs, stats->MVrv, - stats->MVcv, stats->mv_in_out_count, stats->new_mv_count, - stats->count, stats->duration); + stats->MVcv, stats->mv_in_out_count, stats->count, stats->duration); fclose(fpfile); } #endif @@ -157,7 +156,6 @@ static void zero_stats(FIRSTPASS_STATS *section) { section->MVrv = 0.0; section->MVcv = 0.0; section->mv_in_out_count = 0.0; - section->new_mv_count = 0.0; section->count = 0.0; section->duration = 1.0; section->spatial_layer_id = 0; @@ -187,7 +185,6 @@ static void accumulate_stats(FIRSTPASS_STATS *section, section->MVrv += frame->MVrv; section->MVcv += frame->MVcv; section->mv_in_out_count += frame->mv_in_out_count; - section->new_mv_count += frame->new_mv_count; section->count += frame->count; section->duration += frame->duration; } @@ -215,7 +212,6 @@ static void subtract_stats(FIRSTPASS_STATS *section, section->MVrv -= frame->MVrv; section->MVcv -= frame->MVcv; section->mv_in_out_count -= frame->mv_in_out_count; - section->new_mv_count -= frame->new_mv_count; section->count -= frame->count; section->duration -= frame->duration; } @@ -679,9 +675,7 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { int intra_skip_count = 0; int intra_smooth_count = 0; int image_data_start_row = INVALID_ROW; - int new_mv_count = 0; int sum_in_vectors = 0; - MV lastmv = { 0, 0 }; TWO_PASS *twopass = &cpi->twopass; const MV zero_mv = { 0, 0 }; int recon_y_stride, recon_uv_stride, uv_mb_height; @@ -1144,10 +1138,6 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { } 
#endif - // Non-zero vector, was it different from the last non zero vector? - if (!is_equal_mv(&mv, &lastmv)) ++new_mv_count; - lastmv = mv; - // Does the row vector point inwards or outwards? if (mb_row < cm->mb_rows / 2) { if (mv.row > 0) @@ -1263,7 +1253,6 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { fps.MVcv = ((double)sum_mvcs - ((double)sum_mvc * sum_mvc / mvcount)) / mvcount; fps.mv_in_out_count = (double)sum_in_vectors / (mvcount * 2); - fps.new_mv_count = new_mv_count; fps.pcnt_motion = (double)mvcount / num_mbs; } else { fps.MVr = 0.0; @@ -1273,7 +1262,6 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { fps.MVrv = 0.0; fps.MVcv = 0.0; fps.mv_in_out_count = 0.0; - fps.new_mv_count = 0.0; fps.pcnt_motion = 0.0; } diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.h index 6aa39cdc004..5541893dc89 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.h @@ -61,7 +61,6 @@ typedef struct { double MVrv; double MVcv; double mv_in_out_count; - double new_mv_count; double duration; double count; int64_t spatial_layer_id; diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mcomp.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mcomp.c index 2d9bcbda679..70deda84211 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mcomp.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mcomp.c @@ -277,7 +277,7 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) { const uint8_t *const z = x->plane[0].src.buf; \ const int src_stride = x->plane[0].src.stride; \ const MACROBLOCKD *xd = &x->e_mbd; \ - unsigned int besterr = INT_MAX; \ + unsigned int besterr = UINT_MAX; \ unsigned int sse; \ unsigned int whichdir; \ int thismse; \ @@ -472,7 +472,7 @@ uint32_t vp9_find_best_sub_pixel_tree_pruned_evenmore( if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) || (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3))) - return INT_MAX; + return UINT_MAX; return besterr; } @@ -622,7 +622,7 @@ uint32_t vp9_find_best_sub_pixel_tree_pruned( if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) || (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3))) - return INT_MAX; + return UINT_MAX; return besterr; } @@ -646,7 +646,7 @@ uint32_t vp9_find_best_sub_pixel_tree( const uint8_t *const src_address = z; const int src_stride = x->plane[0].src.stride; const MACROBLOCKD *xd = &x->e_mbd; - unsigned int besterr = INT_MAX; + unsigned int besterr = UINT_MAX; unsigned int sse; int thismse; const int y_stride = xd->plane[0].pre[0].stride; @@ -708,7 +708,7 @@ uint32_t vp9_find_best_sub_pixel_tree( *sse1 = sse; } } else { - cost_array[idx] = INT_MAX; + cost_array[idx] = UINT_MAX; } } @@ -737,7 +737,7 @@ uint32_t vp9_find_best_sub_pixel_tree( *sse1 = sse; } } else { - cost_array[idx] = INT_MAX; + cost_array[idx] = UINT_MAX; } if (best_idx < 4 && best_idx >= 0) { @@ -771,7 +771,7 @@ uint32_t vp9_find_best_sub_pixel_tree( if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) || (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3))) - return INT_MAX; + return UINT_MAX; return besterr; } @@ -2318,11 +2318,14 @@ int vp9_refining_search_8p_c(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, const struct buf_2d *const what = &x->plane[0].src; 
const struct buf_2d *const in_what = &xd->plane[0].pre[0]; const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 }; - unsigned int best_sad = + unsigned int best_sad = INT_MAX; + int i, j; + clamp_mv(ref_mv, x->mv_limits.col_min, x->mv_limits.col_max, + x->mv_limits.row_min, x->mv_limits.row_max); + best_sad = fn_ptr->sdaf(what->buf, what->stride, get_buf_from_mv(in_what, ref_mv), in_what->stride, second_pred) + mvsad_err_cost(x, ref_mv, &fcenter_mv, error_per_bit); - int i, j; for (i = 0; i < search_range; ++i) { int best_site = -1; diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_noise_estimate.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_noise_estimate.c index 0e5d8ade4ae..2252fe16b9d 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_noise_estimate.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_noise_estimate.c @@ -26,21 +26,23 @@ void vp9_noise_estimate_init(NOISE_ESTIMATE *const ne, int width, int height) { ne->level = kLowLow; ne->value = 0; ne->count = 0; - ne->thresh = 90; + ne->thresh = 100; ne->last_w = 0; ne->last_h = 0; if (width * height >= 1920 * 1080) { ne->thresh = 200; } else if (width * height >= 1280 * 720) { - ne->thresh = 130; + ne->thresh = 140; } ne->num_frames_estimate = 20; } static int enable_noise_estimation(VP9_COMP *const cpi) { -// Enable noise estimation if denoising is on. +// Enable noise estimation if denoising is on, but not for low resolutions. #if CONFIG_VP9_TEMPORAL_DENOISING - if (cpi->oxcf.noise_sensitivity > 0) return 1; + if (cpi->oxcf.noise_sensitivity > 0 && cpi->common.width >= 640 && + cpi->common.height >= 360) + return 1; #endif // Only allow noise estimate under certain encoding mode. // Enabled for 1 pass CBR, speed >=5, and if resolution is same as original. @@ -50,7 +52,7 @@ static int enable_noise_estimation(VP9_COMP *const cpi) { cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cpi->oxcf.speed >= 5 && cpi->resize_state == ORIG && cpi->resize_pending == 0 && !cpi->use_svc && cpi->oxcf.content != VP9E_CONTENT_SCREEN && cpi->common.width >= 640 && - cpi->common.height >= 480) + cpi->common.height >= 360) return 1; else return 0; diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_pickmode.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_pickmode.c index 2b7ddbcd948..33f3f5a476c 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_pickmode.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_pickmode.c @@ -1259,16 +1259,17 @@ static void recheck_zeromv_after_denoising( [INTER_OFFSET(ZEROMV)]; this_rdc.dist = dist; this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, rate, dist); - // Switch to ZEROMV if the rdcost for ZEROMV on denoised source - // is lower than best_ref mode (on original source). + // Don't switch to ZEROMV if the rdcost for ZEROMV on denoised source + // is higher than best_ref mode (on original source). 
if (this_rdc.rdcost > best_rdc->rdcost) { this_rdc = *best_rdc; mi->mode = ctx_den->best_mode; mi->ref_frame[0] = ctx_den->best_ref_frame; mi->interp_filter = ctx_den->best_pred_filter; - if (ctx_den->best_ref_frame == INTRA_FRAME) + if (ctx_den->best_ref_frame == INTRA_FRAME) { mi->mv[0].as_int = INVALID_MV; - else if (ctx_den->best_ref_frame == GOLDEN_FRAME) { + mi->interp_filter = SWITCHABLE_FILTERS; + } else if (ctx_den->best_ref_frame == GOLDEN_FRAME) { mi->mv[0].as_int = ctx_den->frame_mv[ctx_den->best_mode][ctx_den->best_ref_frame] .as_int; @@ -1395,6 +1396,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, int perform_intra_pred = 1; int use_golden_nonzeromv = 1; int force_skip_low_temp_var = 0; + int skip_ref_find_pred[4] = { 0 }; #if CONFIG_VP9_TEMPORAL_DENOISING VP9_PICKMODE_CTX_DEN ctx_den; int64_t zero_last_cost_orig = INT64_MAX; @@ -1469,9 +1471,15 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, usable_ref_frame = GOLDEN_FRAME; } - if (cpi->oxcf.lag_in_frames > 0 && cpi->oxcf.rc_mode == VPX_VBR && - (cpi->rc.alt_ref_gf_group || cpi->rc.is_src_frame_alt_ref)) - usable_ref_frame = ALTREF_FRAME; + if (cpi->oxcf.lag_in_frames > 0 && cpi->oxcf.rc_mode == VPX_VBR) { + if (cpi->rc.alt_ref_gf_group || cpi->rc.is_src_frame_alt_ref) + usable_ref_frame = ALTREF_FRAME; + + if (cpi->rc.is_src_frame_alt_ref) { + skip_ref_find_pred[LAST_FRAME] = 1; + skip_ref_find_pred[GOLDEN_FRAME] = 1; + } + } // For svc mode, on spatial_layer_id > 0: if the reference has different scale // constrain the inter mode to only test zero motion. @@ -1490,6 +1498,13 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if (cpi->sf.short_circuit_low_temp_var) { force_skip_low_temp_var = get_force_skip_low_temp_var(&x->variance_low[0], mi_row, mi_col, bsize); + // If force_skip_low_temp_var is set, and for short circuit mode = 1 and 3, + // skip golden reference. 
+ if ((cpi->sf.short_circuit_low_temp_var == 1 || + cpi->sf.short_circuit_low_temp_var == 3) && + force_skip_low_temp_var) { + usable_ref_frame = LAST_FRAME; + } } if (!((cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) && @@ -1497,9 +1512,11 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, use_golden_nonzeromv = 0; for (ref_frame = LAST_FRAME; ref_frame <= usable_ref_frame; ++ref_frame) { - find_predictors(cpi, x, ref_frame, frame_mv, const_motion, - &ref_frame_skip_mask, flag_list, tile_data, mi_row, mi_col, - yv12_mb, bsize, force_skip_low_temp_var); + if (!skip_ref_find_pred[ref_frame]) { + find_predictors(cpi, x, ref_frame, frame_mv, const_motion, + &ref_frame_skip_mask, flag_list, tile_data, mi_row, + mi_col, yv12_mb, bsize, force_skip_low_temp_var); + } } for (idx = 0; idx < RT_INTER_MODES; ++idx) { @@ -1519,6 +1536,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, ref_frame = ref_mode_set_svc[idx].ref_frame; } if (ref_frame > usable_ref_frame) continue; + if (skip_ref_find_pred[ref_frame]) continue; if (sf->short_circuit_flat_blocks && x->source_variance == 0 && this_mode != NEARESTMV) { @@ -1558,7 +1576,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, continue; } - if (cpi->sf.short_circuit_low_temp_var == 2 && force_skip_low_temp_var && + if (cpi->sf.short_circuit_low_temp_var >= 2 && force_skip_low_temp_var && ref_frame == LAST_FRAME && this_mode == NEWMV) { continue; } diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.c index b5cfd5de6c6..02059a70544 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.c @@ -45,7 +45,7 @@ #define FRAME_OVERHEAD_BITS 200 -// Use this macro to turn on/off use of alt-refs in one-pass mode. +// Use this macro to turn on/off use of alt-refs in one-pass vbr mode. #define USE_ALTREF_FOR_ONE_PASS 0 #if CONFIG_VP9_HIGHBITDEPTH @@ -414,7 +414,7 @@ static double get_rate_correction_factor(const VP9_COMP *cpi) { } else { if ((cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) && !rc->is_src_frame_alt_ref && !cpi->use_svc && - (cpi->oxcf.rc_mode != VPX_CBR || cpi->oxcf.gf_cbr_boost_pct > 20)) + (cpi->oxcf.rc_mode != VPX_CBR || cpi->oxcf.gf_cbr_boost_pct > 100)) rcf = rc->rate_correction_factors[GF_ARF_STD]; else rcf = rc->rate_correction_factors[INTER_NORMAL]; @@ -440,7 +440,7 @@ static void set_rate_correction_factor(VP9_COMP *cpi, double factor) { } else { if ((cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) && !rc->is_src_frame_alt_ref && !cpi->use_svc && - (cpi->oxcf.rc_mode != VPX_CBR || cpi->oxcf.gf_cbr_boost_pct > 20)) + (cpi->oxcf.rc_mode != VPX_CBR || cpi->oxcf.gf_cbr_boost_pct > 100)) rc->rate_correction_factors[GF_ARF_STD] = factor; else rc->rate_correction_factors[INTER_NORMAL] = factor; @@ -560,15 +560,17 @@ int vp9_rc_regulate_q(const VP9_COMP *cpi, int target_bits_per_frame, // In CBR mode, this makes sure q is between oscillating Qs to prevent // resonance. 
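To make the oscillation guard described above concrete, a small illustration with hypothetical numbers (not from the patch): when the previous two frames used different q values and the test cpi->rc.rc_1_frame * cpi->rc.rc_2_frame == -1 fires, which appears to flag opposite-direction corrections on the last two frames, the candidate q is clamped into the interval spanned by those two q values.

  /* Hypothetical values, for illustration only. */
  int q_1_frame = 30, q_2_frame = 38; /* q used on the previous two frames */
  int q = 26;                         /* candidate q for the current frame */
  q = clamp(q, VPXMIN(q_1_frame, q_2_frame), VPXMAX(q_1_frame, q_2_frame));
  /* q becomes 30, so consecutive frames stop ping-ponging between extremes. */
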
if (cpi->oxcf.rc_mode == VPX_CBR && + (!cpi->oxcf.gf_cbr_boost_pct || + !(cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame)) && (cpi->rc.rc_1_frame * cpi->rc.rc_2_frame == -1) && cpi->rc.q_1_frame != cpi->rc.q_2_frame) { q = clamp(q, VPXMIN(cpi->rc.q_1_frame, cpi->rc.q_2_frame), VPXMAX(cpi->rc.q_1_frame, cpi->rc.q_2_frame)); } #if USE_ALTREF_FOR_ONE_PASS - if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_VBR && - cpi->oxcf.lag_in_frames > 0 && cpi->rc.is_src_frame_alt_ref && - !cpi->rc.alt_ref_gf_group) { + if (cpi->oxcf.enable_auto_arf && cpi->oxcf.pass == 0 && + cpi->oxcf.rc_mode == VPX_VBR && cpi->oxcf.lag_in_frames > 0 && + cpi->rc.is_src_frame_alt_ref && !cpi->rc.alt_ref_gf_group) { q = VPXMIN(q, (q + cpi->rc.last_boosted_qindex) >> 1); } #endif @@ -1528,8 +1530,14 @@ void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) { adjust_gfint_frame_constraint(cpi, rc->frames_to_key); rc->frames_till_gf_update_due = rc->baseline_gf_interval; cpi->refresh_golden_frame = 1; - rc->source_alt_ref_pending = USE_ALTREF_FOR_ONE_PASS; - rc->alt_ref_gf_group = USE_ALTREF_FOR_ONE_PASS; + rc->source_alt_ref_pending = 0; + rc->alt_ref_gf_group = 0; +#if USE_ALTREF_FOR_ONE_PASS + if (cpi->oxcf.enable_auto_arf) { + rc->source_alt_ref_pending = 1; + rc->alt_ref_gf_group = 1; + } +#endif } if (cm->frame_type == KEY_FRAME) target = calc_iframe_target_size_one_pass_vbr(cpi); @@ -2140,20 +2148,22 @@ void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi, uint64_t avg_sad_current) { rc->gfu_boost = DEFAULT_GF_BOOST >> 2; } #if USE_ALTREF_FOR_ONE_PASS - // Don't use alt-ref if there is a scene cut within the group, - // or content is not low. - if ((rc->high_source_sad_lagindex > 0 && - rc->high_source_sad_lagindex <= rc->frames_till_gf_update_due) || - (avg_source_sad_lag > 3 * sad_thresh1 >> 3)) { - rc->source_alt_ref_pending = 0; - rc->alt_ref_gf_group = 0; - } else { - rc->source_alt_ref_pending = 1; - rc->alt_ref_gf_group = 1; - // If alt-ref is used for this gf group, limit the interval. - if (rc->baseline_gf_interval > 10 && - rc->baseline_gf_interval < rc->frames_to_key) - rc->baseline_gf_interval = 10; + if (cpi->oxcf.enable_auto_arf) { + // Don't use alt-ref if there is a scene cut within the group, + // or content is not low. + if ((rc->high_source_sad_lagindex > 0 && + rc->high_source_sad_lagindex <= rc->frames_till_gf_update_due) || + (avg_source_sad_lag > 3 * sad_thresh1 >> 3)) { + rc->source_alt_ref_pending = 0; + rc->alt_ref_gf_group = 0; + } else { + rc->source_alt_ref_pending = 1; + rc->alt_ref_gf_group = 1; + // If alt-ref is used for this gf group, limit the interval. + if (rc->baseline_gf_interval > 10 && + rc->baseline_gf_interval < rc->frames_to_key) + rc->baseline_gf_interval = 10; + } } #endif target = calc_pframe_target_size_one_pass_vbr(cpi); @@ -2243,10 +2253,12 @@ void vp9_avg_source_sad(VP9_COMP *cpi) { for (sbi_row = 0; sbi_row < sb_rows; ++sbi_row) { for (sbi_col = 0; sbi_col < sb_cols; ++sbi_col) { // Checker-board pattern, ignore boundary. - if ((sbi_row > 0 && sbi_col > 0) && - (sbi_row < sb_rows - 1 && sbi_col < sb_cols - 1) && - ((sbi_row % 2 == 0 && sbi_col % 2 == 0) || - (sbi_row % 2 != 0 && sbi_col % 2 != 0))) { + // If the partition copy is on, compute for every superblock. 
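One note on the checker-board test in the hunk that follows: the two parity clauses select interior superblocks whose row and column indices share parity, so an equivalent, sketch-only form of the sampling condition is:

  /* Sketch of an equivalent sampling test; not part of the patch. */
  const int interior = sbi_row > 0 && sbi_col > 0 &&
                       sbi_row < sb_rows - 1 && sbi_col < sb_cols - 1;
  const int on_checkerboard = ((sbi_row + sbi_col) % 2) == 0;
  if (cpi->sf.copy_partition_flag || (interior && on_checkerboard)) {
    /* accumulate SAD for this superblock */
  }
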
+ if (cpi->sf.copy_partition_flag || + ((sbi_row > 0 && sbi_col > 0) && + (sbi_row < sb_rows - 1 && sbi_col < sb_cols - 1) && + ((sbi_row % 2 == 0 && sbi_col % 2 == 0) || + (sbi_row % 2 != 0 && sbi_col % 2 != 0)))) { num_samples++; avg_sad += cpi->fn_ptr[bsize].sdf(src_y, src_ystride, last_src_y, last_src_ystride); @@ -2284,7 +2296,10 @@ void vp9_avg_source_sad(VP9_COMP *cpi) { cpi->ext_refresh_frame_flags_pending == 0) { int target; cpi->refresh_golden_frame = 1; - rc->source_alt_ref_pending = USE_ALTREF_FOR_ONE_PASS; + rc->source_alt_ref_pending = 0; +#if USE_ALTREF_FOR_ONE_PASS + if (cpi->oxcf.enable_auto_arf) rc->source_alt_ref_pending = 1; +#endif rc->gfu_boost = DEFAULT_GF_BOOST >> 1; rc->baseline_gf_interval = VPXMIN(20, VPXMAX(10, rc->baseline_gf_interval)); diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.c index 3e1ed50a6d2..81cb431ba58 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.c @@ -182,6 +182,7 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm, sf->mv.subpel_iters_per_step = 1; sf->mode_skip_start = 10; sf->adaptive_pred_interp_filter = 1; + sf->allow_acl = 0; sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V; sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V; @@ -309,6 +310,8 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, int speed, sf->use_fast_coef_costing = 1; sf->allow_exhaustive_searches = 0; sf->exhaustive_searches_thresh = INT_MAX; + sf->allow_acl = 0; + sf->copy_partition_flag = 0; if (speed >= 1) { sf->allow_txfm_domain_distortion = 1; @@ -494,6 +497,18 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, int speed, if (speed >= 8) { sf->adaptive_rd_thresh = 4; + // Disabled for now until the threshold is tuned. + sf->copy_partition_flag = 0; + if (sf->copy_partition_flag) { + if (cpi->prev_partition == NULL) { + cpi->prev_partition = (BLOCK_SIZE *)vpx_calloc( + cm->mi_stride * cm->mi_rows, sizeof(BLOCK_SIZE)); + } + if (cpi->prev_segment_id == NULL) { + cpi->prev_segment_id = + (int8_t *)vpx_calloc(cm->mi_stride * cm->mi_rows, sizeof(int8_t)); + } + } sf->mv.subpel_force_stop = (content == VP9E_CONTENT_SCREEN) ? 3 : 2; if (content == VP9E_CONTENT_SCREEN) sf->lpf_pick = LPF_PICK_MINIMAL_LPF; // Only keep INTRA_DC mode for speed 8. @@ -505,7 +520,7 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, int speed, if (!cpi->use_svc && cpi->oxcf.rc_mode == VPX_CBR && content != VP9E_CONTENT_SCREEN) { // More aggressive short circuit for speed 8. 
- sf->short_circuit_low_temp_var = 2; + sf->short_circuit_low_temp_var = 3; } sf->limit_newmv_early_exit = 0; } @@ -592,6 +607,7 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { sf->tx_domain_thresh = 99.0; sf->allow_quant_coeff_opt = sf->optimize_coefficients; sf->quant_opt_thresh = 99.0; + sf->allow_acl = 1; for (i = 0; i < TX_SIZES; i++) { sf->intra_y_mode_mask[i] = INTRA_ALL; diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.h index 6d0b9420a1d..944fe6322fb 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.h @@ -244,6 +244,10 @@ typedef struct SPEED_FEATURES { int allow_quant_coeff_opt; double quant_opt_thresh; + // Enable asymptotic closed-loop encoding decision for key frame and + // alternate reference frames. + int allow_acl; + // Use transform domain distortion. Use pixel domain distortion in speed 0 // and certain situations in higher speed to improve the RD model precision. int allow_txfm_domain_distortion; @@ -452,11 +456,13 @@ typedef struct SPEED_FEATURES { int short_circuit_flat_blocks; // Skip a number of expensive mode evaluations for blocks with very low - // temporal variance. - // 1: Skip golden non-zeromv and ALL INTRA for bsize >= 32x32. + // temporal variance. If the low temporal variance flag is set for a block, + // do the following: + // 1: Skip all golden modes and ALL INTRA for bsize >= 32x32. // 2: Skip golden non-zeromv and newmv-last for bsize >= 16x16, skip ALL // INTRA for bsize >= 32x32 and vert/horz INTRA for bsize 16x16, 16x32 and // 32x16. + // 3: Same as (2), but also skip golden zeromv. int short_circuit_low_temp_var; // Limits the rd-threshold update for early exit for the newmv-last mode, @@ -469,6 +475,9 @@ typedef struct SPEED_FEATURES { // Bias to use base mv and skip 1/4 subpel search when use base mv in // enhancement layer. int base_mv_aggressive; + + // Global flag to enable partition copy from the previous frame. + int copy_partition_flag; } SPEED_FEATURES; struct VP9_COMP; diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_svc_layercontext.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_svc_layercontext.c index 2d29e268b1f..1d892dc148b 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_svc_layercontext.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_svc_layercontext.c @@ -650,6 +650,21 @@ int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) { lc->scaling_factor_num, lc->scaling_factor_den, &width, &height); + // The usage of use_base_mv assumes down-scale of 2x2. For now, turn off use + // of base motion vectors if spatial scale factors for any layers are not 2. + // TODO(marpan): Fix this to allow for use_base_mv for scale factors != 2. 
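To make the scale check in the SVC hunk below concrete (a sketch with illustrative numbers, not taken from the patch): a spatial layer is exactly half the width and height of the layer above it when scaling_factor_num / scaling_factor_den equals 1/2, which the code tests as num == den >> 1.

  /* Illustrative values only. */
  /* 1:2 scaling (the supported case): 1 == (2 >> 1), so use_base_mv stays enabled. */
  /* 2:3 scaling: 2 != (3 >> 1), so use_base_mv is turned off for this frame.      */
  if (lc->scaling_factor_num != lc->scaling_factor_den >> 1) cpi->svc.use_base_mv = 0;
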
+ if (cpi->svc.number_spatial_layers > 1) { + int sl; + for (sl = 0; sl < cpi->svc.number_spatial_layers - 1; ++sl) { + lc = &cpi->svc.layer_context[sl * cpi->svc.number_temporal_layers + + cpi->svc.temporal_layer_id]; + if (lc->scaling_factor_num != lc->scaling_factor_den >> 1) { + cpi->svc.use_base_mv = 0; + break; + } + } + } + if (vp9_set_size_literal(cpi, width, height) != 0) return VPX_CODEC_INVALID_PARAM; diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_temporal_filter.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_temporal_filter.c index a167eeb15de..344658483a1 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_temporal_filter.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_temporal_filter.c @@ -208,17 +208,17 @@ void vp9_highbd_temporal_filter_apply_c( } #endif // CONFIG_VP9_HIGHBITDEPTH -static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi, - uint8_t *arf_frame_buf, - uint8_t *frame_ptr_buf, - int stride) { +static uint32_t temporal_filter_find_matching_mb_c(VP9_COMP *cpi, + uint8_t *arf_frame_buf, + uint8_t *frame_ptr_buf, + int stride) { MACROBLOCK *const x = &cpi->td.mb; MACROBLOCKD *const xd = &x->e_mbd; MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv; const SEARCH_METHODS old_search_method = mv_sf->search_method; int step_param; int sadpb = x->sadperbit16; - int bestsme = INT_MAX; + uint32_t bestsme = UINT_MAX; uint32_t distortion; uint32_t sse; int cost_list[5]; @@ -334,8 +334,8 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi, ((mb_cols - 1 - mb_col) * 16) + (17 - 2 * VP9_INTERP_EXTEND); for (frame = 0; frame < frame_count; frame++) { - const int thresh_low = 10000; - const int thresh_high = 20000; + const uint32_t thresh_low = 10000; + const uint32_t thresh_high = 20000; if (frames[frame] == NULL) continue; @@ -346,7 +346,7 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi, filter_weight = 2; } else { // Find best match in this frame by MC - int err = temporal_filter_find_matching_mb_c( + uint32_t err = temporal_filter_find_matching_mb_c( cpi, frames[alt_ref_index]->y_buffer + mb_y_offset, frames[frame]->y_buffer + mb_y_offset, frames[frame]->y_stride); diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/vp9_cx_iface.c b/chromium/third_party/libvpx/source/libvpx/vp9/vp9_cx_iface.c index a797b2c2624..e6cea080d16 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/vp9_cx_iface.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/vp9_cx_iface.c @@ -157,6 +157,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, RANGE_CHECK_HI(cfg, rc_max_quantizer, 63); RANGE_CHECK_HI(cfg, rc_min_quantizer, cfg->rc_max_quantizer); RANGE_CHECK_BOOL(extra_cfg, lossless); + RANGE_CHECK_BOOL(extra_cfg, frame_parallel_decoding_mode); RANGE_CHECK(extra_cfg, aq_mode, 0, AQ_MODE_COUNT - 2); RANGE_CHECK(extra_cfg, alt_ref_aq, 0, 1); RANGE_CHECK(extra_cfg, frame_periodic_boost, 0, 1); @@ -389,6 +390,50 @@ static int get_image_bps(const vpx_image_t *img) { return 0; } +// Modify the encoder config for the target level. +static void config_target_level(VP9EncoderConfig *oxcf) { + double max_average_bitrate; // in bits per second + int max_over_shoot_pct; + const int target_level_index = get_level_index(oxcf->target_level); + + vpx_clear_system_state(); + assert(target_level_index >= 0); + assert(target_level_index < VP9_LEVELS); + + // Maximum target bit-rate is level_limit * 80%. 
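For context on the 800.0 factor applied just below, a hedged reading: average_bitrate in the vp9_level_defs table shown earlier in this patch appears to be in kbits/s, while target_bandwidth is in bits/s, so 800 = 1000 (kbps to bps) * 0.8 (the 80% cap). Using the LEVEL_1 row of that table (average_bitrate 200) as a worked example:

  /* Worked example with the LEVEL_1 limit; units assumed as described above. */
  max_average_bitrate = 200 * 800.0; /* 160000 bits/s, i.e. 80% of 200 kbps */
  if ((double)oxcf->target_bandwidth > max_average_bitrate)
    oxcf->target_bandwidth = (int64_t)max_average_bitrate;
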
+ max_average_bitrate = + vp9_level_defs[target_level_index].average_bitrate * 800.0; + if ((double)oxcf->target_bandwidth > max_average_bitrate) + oxcf->target_bandwidth = (int64_t)(max_average_bitrate); + if (oxcf->ss_number_layers == 1 && oxcf->pass != 0) + oxcf->ss_target_bitrate[0] = (int)oxcf->target_bandwidth; + + // Adjust max over-shoot percentage. + max_over_shoot_pct = + (int)((max_average_bitrate * 1.10 - (double)oxcf->target_bandwidth) * + 100 / (double)(oxcf->target_bandwidth)); + if (oxcf->over_shoot_pct > max_over_shoot_pct) + oxcf->over_shoot_pct = max_over_shoot_pct; + + // Adjust worst allowed quantizer. + oxcf->worst_allowed_q = vp9_quantizer_to_qindex(63); + + // Adjust minimum art-ref distance. + if (oxcf->min_gf_interval < + (int)vp9_level_defs[target_level_index].min_altref_distance) + oxcf->min_gf_interval = + (int)vp9_level_defs[target_level_index].min_altref_distance; + + // Adjust maximum column tiles. + if (vp9_level_defs[target_level_index].max_col_tiles < + (1 << oxcf->tile_columns)) { + while (oxcf->tile_columns > 0 && + vp9_level_defs[target_level_index].max_col_tiles < + (1 << oxcf->tile_columns)) + --oxcf->tile_columns; + } +} + static vpx_codec_err_t set_encoder_config( VP9EncoderConfig *oxcf, const vpx_codec_enc_cfg_t *cfg, const struct vp9_extracfg *extra_cfg) { @@ -532,6 +577,8 @@ static vpx_codec_err_t set_encoder_config( } else if (oxcf->ts_number_layers == 1) { oxcf->ts_rate_decimator[0] = 1; } + + if (get_level_index(oxcf->target_level) >= 0) config_target_level(oxcf); /* printf("Current VP9 Settings: \n"); printf("target_bandwidth: %d\n", oxcf->target_bandwidth); @@ -1002,6 +1049,28 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, if (cpi == NULL) return VPX_CODEC_INVALID_PARAM; + if (cpi->oxcf.pass == 2 && cpi->level_constraint.level_index >= 0 && + !cpi->level_constraint.rc_config_updated) { + SVC *const svc = &cpi->svc; + const int is_two_pass_svc = + (svc->number_spatial_layers > 1) || (svc->number_temporal_layers > 1); + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + TWO_PASS *const twopass = &cpi->twopass; + FIRSTPASS_STATS *stats = &twopass->total_stats; + if (is_two_pass_svc) { + const double frame_rate = 10000000.0 * stats->count / stats->duration; + vp9_update_spatial_layer_framerate(cpi, frame_rate); + twopass->bits_left = + (int64_t)(stats->duration * + svc->layer_context[svc->spatial_layer_id].target_bandwidth / + 10000000.0); + } else { + twopass->bits_left = + (int64_t)(stats->duration * oxcf->target_bandwidth / 10000000.0); + } + cpi->level_constraint.rc_config_updated = 1; + } + if (img != NULL) { res = validate_img(ctx, img); if (res == VPX_CODEC_OK) { diff --git a/chromium/third_party/libvpx/source/libvpx/vpx/src/svc_encodeframe.c b/chromium/third_party/libvpx/source/libvpx/vpx/src/svc_encodeframe.c index 88b1531d8c4..c2f80d88515 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx/src/svc_encodeframe.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx/src/svc_encodeframe.c @@ -201,7 +201,7 @@ static vpx_codec_err_t parse_options(SvcContext *svc_ctx, const char *options) { char *input_string; char *option_name; char *option_value; - char *input_ptr; + char *input_ptr = NULL; SvcInternal_t *const si = get_svc_internal(svc_ctx); vpx_codec_err_t res = VPX_CODEC_OK; int i, alt_ref_enabled = 0; diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/deblock_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/deblock_neon.c new file mode 100644 index 00000000000..1fb41d29920 --- 
/dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/deblock_neon.c @@ -0,0 +1,485 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +#include <assert.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/transpose_neon.h" + +extern const int16_t vpx_rv[]; + +static uint8x8_t average_k_out(const uint8x8_t a2, const uint8x8_t a1, + const uint8x8_t v0, const uint8x8_t b1, + const uint8x8_t b2) { + const uint8x8_t k1 = vrhadd_u8(a2, a1); + const uint8x8_t k2 = vrhadd_u8(b2, b1); + const uint8x8_t k3 = vrhadd_u8(k1, k2); + return vrhadd_u8(k3, v0); +} + +static uint8x8_t generate_mask(const uint8x8_t a2, const uint8x8_t a1, + const uint8x8_t v0, const uint8x8_t b1, + const uint8x8_t b2, const uint8x8_t filter) { + const uint8x8_t a2_v0 = vabd_u8(a2, v0); + const uint8x8_t a1_v0 = vabd_u8(a1, v0); + const uint8x8_t b1_v0 = vabd_u8(b1, v0); + const uint8x8_t b2_v0 = vabd_u8(b2, v0); + + uint8x8_t max = vmax_u8(a2_v0, a1_v0); + max = vmax_u8(b1_v0, max); + max = vmax_u8(b2_v0, max); + return vclt_u8(max, filter); +} + +static uint8x8_t generate_output(const uint8x8_t a2, const uint8x8_t a1, + const uint8x8_t v0, const uint8x8_t b1, + const uint8x8_t b2, const uint8x8_t filter) { + const uint8x8_t k_out = average_k_out(a2, a1, v0, b1, b2); + const uint8x8_t mask = generate_mask(a2, a1, v0, b1, b2, filter); + + return vbsl_u8(mask, k_out, v0); +} + +// Same functions but for uint8x16_t. +static uint8x16_t average_k_outq(const uint8x16_t a2, const uint8x16_t a1, + const uint8x16_t v0, const uint8x16_t b1, + const uint8x16_t b2) { + const uint8x16_t k1 = vrhaddq_u8(a2, a1); + const uint8x16_t k2 = vrhaddq_u8(b2, b1); + const uint8x16_t k3 = vrhaddq_u8(k1, k2); + return vrhaddq_u8(k3, v0); +} + +static uint8x16_t generate_maskq(const uint8x16_t a2, const uint8x16_t a1, + const uint8x16_t v0, const uint8x16_t b1, + const uint8x16_t b2, const uint8x16_t filter) { + const uint8x16_t a2_v0 = vabdq_u8(a2, v0); + const uint8x16_t a1_v0 = vabdq_u8(a1, v0); + const uint8x16_t b1_v0 = vabdq_u8(b1, v0); + const uint8x16_t b2_v0 = vabdq_u8(b2, v0); + + uint8x16_t max = vmaxq_u8(a2_v0, a1_v0); + max = vmaxq_u8(b1_v0, max); + max = vmaxq_u8(b2_v0, max); + return vcltq_u8(max, filter); +} + +static uint8x16_t generate_outputq(const uint8x16_t a2, const uint8x16_t a1, + const uint8x16_t v0, const uint8x16_t b1, + const uint8x16_t b2, + const uint8x16_t filter) { + const uint8x16_t k_out = average_k_outq(a2, a1, v0, b1, b2); + const uint8x16_t mask = generate_maskq(a2, a1, v0, b1, b2, filter); + + return vbslq_u8(mask, k_out, v0); +} + +void vpx_post_proc_down_and_across_mb_row_neon(uint8_t *src_ptr, + uint8_t *dst_ptr, int src_stride, + int dst_stride, int cols, + uint8_t *f, int size) { + uint8_t *src, *dst; + int row; + int col; + + // Process a stripe of macroblocks. The stripe will be a multiple of 16 (for + // Y) or 8 (for U/V) wide (cols) and the height (size) will be 16 (for Y) or 8 + // (for U/V). + assert((size == 8 || size == 16) && cols % 8 == 0); + + // While columns of length 16 can be processed, load them. 
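Before the vectorized loops that follow, it may help to see what one output pixel of this deblock filter computes. A rough scalar equivalent of generate_output() above (a readability sketch, not part of the patch; vrhadd_u8 is a rounding average, (x + y + 1) >> 1):

  #include <stdlib.h> /* abs() */

  static uint8_t scalar_generate_output(uint8_t a2, uint8_t a1, uint8_t v0,
                                        uint8_t b1, uint8_t b2, uint8_t f) {
    /* Rounding averages of the two neighbours on each side and the centre. */
    const uint8_t k1 = (a2 + a1 + 1) >> 1;
    const uint8_t k2 = (b2 + b1 + 1) >> 1;
    const uint8_t k3 = (k1 + k2 + 1) >> 1;
    const uint8_t k_out = (k3 + v0 + 1) >> 1;
    /* Use the filtered value only if every neighbour is within the threshold. */
    int max_diff = abs(a2 - v0);
    if (abs(a1 - v0) > max_diff) max_diff = abs(a1 - v0);
    if (abs(b1 - v0) > max_diff) max_diff = abs(b1 - v0);
    if (abs(b2 - v0) > max_diff) max_diff = abs(b2 - v0);
    return (max_diff < f) ? k_out : v0;
  }
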
+ for (col = 0; col < cols - 8; col += 16) { + uint8x16_t a0, a1, a2, a3, a4, a5, a6, a7; + src = src_ptr - 2 * src_stride; + dst = dst_ptr; + + a0 = vld1q_u8(src); + src += src_stride; + a1 = vld1q_u8(src); + src += src_stride; + a2 = vld1q_u8(src); + src += src_stride; + a3 = vld1q_u8(src); + src += src_stride; + + for (row = 0; row < size; row += 4) { + uint8x16_t v_out_0, v_out_1, v_out_2, v_out_3; + const uint8x16_t filterq = vld1q_u8(f + col); + + a4 = vld1q_u8(src); + src += src_stride; + a5 = vld1q_u8(src); + src += src_stride; + a6 = vld1q_u8(src); + src += src_stride; + a7 = vld1q_u8(src); + src += src_stride; + + v_out_0 = generate_outputq(a0, a1, a2, a3, a4, filterq); + v_out_1 = generate_outputq(a1, a2, a3, a4, a5, filterq); + v_out_2 = generate_outputq(a2, a3, a4, a5, a6, filterq); + v_out_3 = generate_outputq(a3, a4, a5, a6, a7, filterq); + + vst1q_u8(dst, v_out_0); + dst += dst_stride; + vst1q_u8(dst, v_out_1); + dst += dst_stride; + vst1q_u8(dst, v_out_2); + dst += dst_stride; + vst1q_u8(dst, v_out_3); + dst += dst_stride; + + // Rotate over to the next slot. + a0 = a4; + a1 = a5; + a2 = a6; + a3 = a7; + } + + src_ptr += 16; + dst_ptr += 16; + } + + // Clean up any left over column of length 8. + if (col != cols) { + uint8x8_t a0, a1, a2, a3, a4, a5, a6, a7; + src = src_ptr - 2 * src_stride; + dst = dst_ptr; + + a0 = vld1_u8(src); + src += src_stride; + a1 = vld1_u8(src); + src += src_stride; + a2 = vld1_u8(src); + src += src_stride; + a3 = vld1_u8(src); + src += src_stride; + + for (row = 0; row < size; row += 4) { + uint8x8_t v_out_0, v_out_1, v_out_2, v_out_3; + const uint8x8_t filter = vld1_u8(f + col); + + a4 = vld1_u8(src); + src += src_stride; + a5 = vld1_u8(src); + src += src_stride; + a6 = vld1_u8(src); + src += src_stride; + a7 = vld1_u8(src); + src += src_stride; + + v_out_0 = generate_output(a0, a1, a2, a3, a4, filter); + v_out_1 = generate_output(a1, a2, a3, a4, a5, filter); + v_out_2 = generate_output(a2, a3, a4, a5, a6, filter); + v_out_3 = generate_output(a3, a4, a5, a6, a7, filter); + + vst1_u8(dst, v_out_0); + dst += dst_stride; + vst1_u8(dst, v_out_1); + dst += dst_stride; + vst1_u8(dst, v_out_2); + dst += dst_stride; + vst1_u8(dst, v_out_3); + dst += dst_stride; + + // Rotate over to the next slot. + a0 = a4; + a1 = a5; + a2 = a6; + a3 = a7; + } + + // Not strictly necessary but makes resetting dst_ptr easier. + dst_ptr += 8; + } + + dst_ptr -= cols; + + for (row = 0; row < size; row += 8) { + uint8x8_t a0, a1, a2, a3; + uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7; + + src = dst_ptr; + dst = dst_ptr; + + // Load 8 values, transpose 4 of them, and discard 2 because they will be + // reloaded later. + load_and_transpose_u8_4x8(src, dst_stride, &a0, &a1, &a2, &a3); + a3 = a1; + a2 = a1 = a0; // Extend left border. + + src += 2; + + for (col = 0; col < cols; col += 8) { + uint8x8_t v_out_0, v_out_1, v_out_2, v_out_3, v_out_4, v_out_5, v_out_6, + v_out_7; + // Although the filter is meant to be applied vertically and is instead + // being applied horizontally here it's OK because it's set in blocks of 8 + // (or 16). + const uint8x8_t filter = vld1_u8(f + col); + + load_and_transpose_u8_8x8(src, dst_stride, &b0, &b1, &b2, &b3, &b4, &b5, + &b6, &b7); + + if (col + 8 == cols) { + // Last row. Extend border (b5). 
+ b6 = b7 = b5; + } + + v_out_0 = generate_output(a0, a1, a2, a3, b0, filter); + v_out_1 = generate_output(a1, a2, a3, b0, b1, filter); + v_out_2 = generate_output(a2, a3, b0, b1, b2, filter); + v_out_3 = generate_output(a3, b0, b1, b2, b3, filter); + v_out_4 = generate_output(b0, b1, b2, b3, b4, filter); + v_out_5 = generate_output(b1, b2, b3, b4, b5, filter); + v_out_6 = generate_output(b2, b3, b4, b5, b6, filter); + v_out_7 = generate_output(b3, b4, b5, b6, b7, filter); + + transpose_and_store_u8_8x8(dst, dst_stride, v_out_0, v_out_1, v_out_2, + v_out_3, v_out_4, v_out_5, v_out_6, v_out_7); + + a0 = b4; + a1 = b5; + a2 = b6; + a3 = b7; + + src += 8; + dst += 8; + } + + dst_ptr += 8 * dst_stride; + } +} + +// sum += x; +// sumsq += x * y; +static void accumulate_sum_sumsq(const int16x4_t x, const int32x4_t xy, + int16x4_t *const sum, int32x4_t *const sumsq) { + const int16x4_t zero = vdup_n_s16(0); + const int32x4_t zeroq = vdupq_n_s32(0); + + // Add in the first set because vext doesn't work with '0'. + *sum = vadd_s16(*sum, x); + *sumsq = vaddq_s32(*sumsq, xy); + + // Shift x and xy to the right and sum. vext requires an immediate. + *sum = vadd_s16(*sum, vext_s16(zero, x, 1)); + *sumsq = vaddq_s32(*sumsq, vextq_s32(zeroq, xy, 1)); + + *sum = vadd_s16(*sum, vext_s16(zero, x, 2)); + *sumsq = vaddq_s32(*sumsq, vextq_s32(zeroq, xy, 2)); + + *sum = vadd_s16(*sum, vext_s16(zero, x, 3)); + *sumsq = vaddq_s32(*sumsq, vextq_s32(zeroq, xy, 3)); +} + +// Generate mask based on (sumsq * 15 - sum * sum < flimit) +static uint16x4_t calculate_mask(const int16x4_t sum, const int32x4_t sumsq, + const int32x4_t f, const int32x4_t fifteen) { + const int32x4_t a = vmulq_s32(sumsq, fifteen); + const int32x4_t b = vmlsl_s16(a, sum, sum); + const uint32x4_t mask32 = vcltq_s32(b, f); + return vmovn_u32(mask32); +} + +static uint8x8_t combine_mask(const int16x4_t sum_low, const int16x4_t sum_high, + const int32x4_t sumsq_low, + const int32x4_t sumsq_high, const int32x4_t f) { + const int32x4_t fifteen = vdupq_n_s32(15); + const uint16x4_t mask16_low = calculate_mask(sum_low, sumsq_low, f, fifteen); + const uint16x4_t mask16_high = + calculate_mask(sum_high, sumsq_high, f, fifteen); + return vmovn_u16(vcombine_u16(mask16_low, mask16_high)); +} + +// Apply filter of (8 + sum + s[c]) >> 4. +static uint8x8_t filter_pixels(const int16x8_t sum, const uint8x8_t s) { + const int16x8_t s16 = vreinterpretq_s16_u16(vmovl_u8(s)); + const int16x8_t sum_s = vaddq_s16(sum, s16); + + return vqrshrun_n_s16(sum_s, 4); +} + +void vpx_mbpost_proc_across_ip_neon(uint8_t *src, int pitch, int rows, int cols, + int flimit) { + int row, col; + const int32x4_t f = vdupq_n_s32(flimit); + + assert(cols % 8 == 0); + + for (row = 0; row < rows; ++row) { + // Sum the first 8 elements, which are extended from s[0]. + // sumsq gets primed with +16. + int sumsq = src[0] * src[0] * 9 + 16; + int sum = src[0] * 9; + + uint8x8_t left_context, s, right_context; + int16x4_t sum_low, sum_high; + int32x4_t sumsq_low, sumsq_high; + + // Sum (+square) the next 6 elements. + // Skip [0] because it's included above. + for (col = 1; col <= 6; ++col) { + sumsq += src[col] * src[col]; + sum += src[col]; + } + + // Prime the sums. Later the loop uses the _high values to prime the new + // vectors. + sumsq_high = vdupq_n_s32(sumsq); + sum_high = vdup_n_s16(sum); + + // Manually extend the left border. 
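A compact scalar description of what this row filter computes per pixel may help when reading the vector code below (a sketch, not part of the patch): sum and sumsq are running totals over a 15-pixel horizontal window centred on column c, with the row borders replicated, and a pixel is only smoothed where that window has low variance.

  /* Per-pixel behaviour in scalar form (sketch); the filter works in place. */
  /* sum   = sum of src[c - 7 .. c + 7] (borders replicated)                 */
  /* sumsq = sum of their squares                                            */
  if (sumsq * 15 - sum * sum < flimit)  /* low local variance */
    src[c] = (8 + sum + src[c]) >> 4;   /* average of 16 values, rounded */
  /* otherwise src[c] is left unchanged */
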
+ left_context = vdup_n_u8(src[0]); + + for (col = 0; col < cols; col += 8) { + uint8x8_t mask, output; + int16x8_t x, y; + int32x4_t xy_low, xy_high; + + s = vld1_u8(src + col); + + if (col + 8 == cols) { + // Last row. Extend border. + right_context = vdup_n_u8(src[col + 7]); + } else { + right_context = vld1_u8(src + col + 7); + } + + x = vreinterpretq_s16_u16(vsubl_u8(right_context, left_context)); + y = vreinterpretq_s16_u16(vaddl_u8(right_context, left_context)); + xy_low = vmull_s16(vget_low_s16(x), vget_low_s16(y)); + xy_high = vmull_s16(vget_high_s16(x), vget_high_s16(y)); + + // Catch up to the last sum'd value. + sum_low = vdup_lane_s16(sum_high, 3); + sumsq_low = vdupq_lane_s32(vget_high_s32(sumsq_high), 1); + + accumulate_sum_sumsq(vget_low_s16(x), xy_low, &sum_low, &sumsq_low); + + // Need to do this sequentially because we need the max value from + // sum_low. + sum_high = vdup_lane_s16(sum_low, 3); + sumsq_high = vdupq_lane_s32(vget_high_s32(sumsq_low), 1); + + accumulate_sum_sumsq(vget_high_s16(x), xy_high, &sum_high, &sumsq_high); + + mask = combine_mask(sum_low, sum_high, sumsq_low, sumsq_high, f); + + output = filter_pixels(vcombine_s16(sum_low, sum_high), s); + output = vbsl_u8(mask, output, s); + + vst1_u8(src + col, output); + + left_context = s; + } + + src += pitch; + } +} + +// Apply filter of (vpx_rv + sum + s[c]) >> 4. +static uint8x8_t filter_pixels_rv(const int16x8_t sum, const uint8x8_t s, + const int16x8_t rv) { + const int16x8_t s16 = vreinterpretq_s16_u16(vmovl_u8(s)); + const int16x8_t sum_s = vaddq_s16(sum, s16); + const int16x8_t rounded = vaddq_s16(sum_s, rv); + + return vqshrun_n_s16(rounded, 4); +} + +void vpx_mbpost_proc_down_neon(uint8_t *dst, int pitch, int rows, int cols, + int flimit) { + int row, col, i; + const int32x4_t f = vdupq_n_s32(flimit); + uint8x8_t below_context = vdup_n_u8(0); + + // 8 columns are processed at a time. + // If rows is less than 8 the bottom border extension fails. + assert(cols % 8 == 0); + assert(rows >= 8); + + // Load and keep the first 8 values in memory. Process a vertical stripe that + // is 8 wide. + for (col = 0; col < cols; col += 8) { + uint8x8_t s, above_context[8]; + int16x8_t sum, sum_tmp; + int32x4_t sumsq_low, sumsq_high; + + // Load and extend the top border. + s = vld1_u8(dst); + for (i = 0; i < 8; i++) { + above_context[i] = s; + } + + sum_tmp = vreinterpretq_s16_u16(vmovl_u8(s)); + + // sum * 9 + sum = vmulq_n_s16(sum_tmp, 9); + + // (sum * 9) * sum == sum * sum * 9 + sumsq_low = vmull_s16(vget_low_s16(sum), vget_low_s16(sum_tmp)); + sumsq_high = vmull_s16(vget_high_s16(sum), vget_high_s16(sum_tmp)); + + // Load and discard the next 6 values to prime sum and sumsq. + for (i = 1; i <= 6; ++i) { + const uint8x8_t a = vld1_u8(dst + i * pitch); + const int16x8_t b = vreinterpretq_s16_u16(vmovl_u8(a)); + sum = vaddq_s16(sum, b); + + sumsq_low = vmlal_s16(sumsq_low, vget_low_s16(b), vget_low_s16(b)); + sumsq_high = vmlal_s16(sumsq_high, vget_high_s16(b), vget_high_s16(b)); + } + + for (row = 0; row < rows; ++row) { + uint8x8_t mask, output; + int16x8_t x, y; + int32x4_t xy_low, xy_high; + + s = vld1_u8(dst + row * pitch); + + // Extend the bottom border. 
+ if (row + 7 < rows) { + below_context = vld1_u8(dst + (row + 7) * pitch); + } + + x = vreinterpretq_s16_u16(vsubl_u8(below_context, above_context[0])); + y = vreinterpretq_s16_u16(vaddl_u8(below_context, above_context[0])); + xy_low = vmull_s16(vget_low_s16(x), vget_low_s16(y)); + xy_high = vmull_s16(vget_high_s16(x), vget_high_s16(y)); + + sum = vaddq_s16(sum, x); + + sumsq_low = vaddq_s32(sumsq_low, xy_low); + sumsq_high = vaddq_s32(sumsq_high, xy_high); + + mask = combine_mask(vget_low_s16(sum), vget_high_s16(sum), sumsq_low, + sumsq_high, f); + + output = filter_pixels_rv(sum, s, vld1q_s16(vpx_rv + (row & 127))); + output = vbsl_u8(mask, output, s); + + vst1_u8(dst + row * pitch, output); + + above_context[0] = above_context[1]; + above_context[1] = above_context[2]; + above_context[2] = above_context[3]; + above_context[3] = above_context[4]; + above_context[4] = above_context[5]; + above_context[5] = above_context[6]; + above_context[6] = above_context[7]; + above_context[7] = s; + } + + dst += 8; + } +} diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c new file mode 100644 index 00000000000..26fa3e216bb --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c @@ -0,0 +1,171 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/inv_txfm.h" + +static INLINE void highbd_idct4x4_1_add_kernel1(uint16_t **dest, + const int stride, + const int16x8_t res, + const int16x8_t max) { + const uint16x4_t a0 = vld1_u16(*dest); + const uint16x4_t a1 = vld1_u16(*dest + stride); + const int16x8_t a = vreinterpretq_s16_u16(vcombine_u16(a0, a1)); + // Note: In some profile tests, res is quite close to +/-32767. + // We use saturating addition. + const int16x8_t b = vqaddq_s16(res, a); + const int16x8_t c = vminq_s16(b, max); + const uint16x8_t d = vqshluq_n_s16(c, 0); + vst1_u16(*dest, vget_low_u16(d)); + *dest += stride; + vst1_u16(*dest, vget_high_u16(d)); + *dest += stride; +} + +// res is in reverse row order +static INLINE void highbd_idct4x4_1_add_kernel2(uint16_t **dest, + const int stride, + const int16x8_t res, + const int16x8_t max) { + const uint16x4_t a0 = vld1_u16(*dest); + const uint16x4_t a1 = vld1_u16(*dest + stride); + const int16x8_t a = vreinterpretq_s16_u16(vcombine_u16(a1, a0)); + // Note: In some profile tests, res is quite close to +/-32767. + // We use saturating addition. 
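To summarize the clipping sequence used by the highbd add kernels in this new file (a scalar sketch, not part of the patch): add the signed residual to the reconstructed sample, then clip the result to [0, (1 << bd) - 1], which is what the saturating add, vminq_s16 and vqshluq_n_s16 steps achieve in vector form.

  static uint16_t highbd_clip_add(uint16_t pixel, int16_t res, int bd) {
    int v = pixel + res;                      /* residual may be negative */
    if (v < 0) v = 0;
    if (v > (1 << bd) - 1) v = (1 << bd) - 1; /* clip to the bit-depth range */
    return (uint16_t)v;
  }
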
+ const int16x8_t b = vqaddq_s16(res, a); + const int16x8_t c = vminq_s16(b, max); + const uint16x8_t d = vqshluq_n_s16(c, 0); + vst1_u16(*dest, vget_high_u16(d)); + *dest += stride; + vst1_u16(*dest, vget_low_u16(d)); + *dest += stride; +} + +void vpx_highbd_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + const int16x8_t max = vdupq_n_s16((1 << bd) - 1); + const tran_low_t out0 = + HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); + const tran_low_t out1 = + HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd); + const int16_t a1 = ROUND_POWER_OF_TWO(out1, 4); + const int16x8_t dc = vdupq_n_s16(a1); + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + + highbd_idct4x4_1_add_kernel1(&dest, stride, dc, max); + highbd_idct4x4_1_add_kernel1(&dest, stride, dc, max); +} + +static INLINE void idct4x4_16_kernel_bd10(const int32x4_t cospis, + int32x4_t *const a0, + int32x4_t *const a1, + int32x4_t *const a2, + int32x4_t *const a3) { + int32x4_t b0, b1, b2, b3; + + transpose_s32_4x4(a0, a1, a2, a3); + b0 = vaddq_s32(*a0, *a2); + b1 = vsubq_s32(*a0, *a2); + b0 = vmulq_lane_s32(b0, vget_high_s32(cospis), 0); + b1 = vmulq_lane_s32(b1, vget_high_s32(cospis), 0); + b2 = vmulq_lane_s32(*a1, vget_high_s32(cospis), 1); + b3 = vmulq_lane_s32(*a1, vget_low_s32(cospis), 1); + b2 = vmlsq_lane_s32(b2, *a3, vget_low_s32(cospis), 1); + b3 = vmlaq_lane_s32(b3, *a3, vget_high_s32(cospis), 1); + b0 = vrshrq_n_s32(b0, 14); + b1 = vrshrq_n_s32(b1, 14); + b2 = vrshrq_n_s32(b2, 14); + b3 = vrshrq_n_s32(b3, 14); + *a0 = vaddq_s32(b0, b3); + *a1 = vaddq_s32(b1, b2); + *a2 = vsubq_s32(b1, b2); + *a3 = vsubq_s32(b0, b3); +} + +static INLINE void idct4x4_16_kernel_bd12(const int32x4_t cospis, + int32x4_t *const a0, + int32x4_t *const a1, + int32x4_t *const a2, + int32x4_t *const a3) { + int32x4_t b0, b1, b2, b3; + int64x2_t c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11; + + transpose_s32_4x4(a0, a1, a2, a3); + b0 = vaddq_s32(*a0, *a2); + b1 = vsubq_s32(*a0, *a2); + c0 = vmull_lane_s32(vget_low_s32(b0), vget_high_s32(cospis), 0); + c1 = vmull_lane_s32(vget_high_s32(b0), vget_high_s32(cospis), 0); + c2 = vmull_lane_s32(vget_low_s32(b1), vget_high_s32(cospis), 0); + c3 = vmull_lane_s32(vget_high_s32(b1), vget_high_s32(cospis), 0); + c4 = vmull_lane_s32(vget_low_s32(*a1), vget_high_s32(cospis), 1); + c5 = vmull_lane_s32(vget_high_s32(*a1), vget_high_s32(cospis), 1); + c6 = vmull_lane_s32(vget_low_s32(*a1), vget_low_s32(cospis), 1); + c7 = vmull_lane_s32(vget_high_s32(*a1), vget_low_s32(cospis), 1); + c8 = vmull_lane_s32(vget_low_s32(*a3), vget_low_s32(cospis), 1); + c9 = vmull_lane_s32(vget_high_s32(*a3), vget_low_s32(cospis), 1); + c10 = vmull_lane_s32(vget_low_s32(*a3), vget_high_s32(cospis), 1); + c11 = vmull_lane_s32(vget_high_s32(*a3), vget_high_s32(cospis), 1); + c4 = vsubq_s64(c4, c8); + c5 = vsubq_s64(c5, c9); + c6 = vaddq_s64(c6, c10); + c7 = vaddq_s64(c7, c11); + b0 = vcombine_s32(vrshrn_n_s64(c0, 14), vrshrn_n_s64(c1, 14)); + b1 = vcombine_s32(vrshrn_n_s64(c2, 14), vrshrn_n_s64(c3, 14)); + b2 = vcombine_s32(vrshrn_n_s64(c4, 14), vrshrn_n_s64(c5, 14)); + b3 = vcombine_s32(vrshrn_n_s64(c6, 14), vrshrn_n_s64(c7, 14)); + *a0 = vaddq_s32(b0, b3); + *a1 = vaddq_s32(b1, b2); + *a2 = vsubq_s32(b1, b2); + *a3 = vsubq_s32(b0, b3); +} + +void vpx_highbd_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + const int16x8_t max = vdupq_n_s16((1 << bd) - 1); + int32x4_t c0 = vld1q_s32(input); + int32x4_t c1 = vld1q_s32(input + 4); + 
int32x4_t c2 = vld1q_s32(input + 8); + int32x4_t c3 = vld1q_s32(input + 12); + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + int16x8_t a0, a1; + + if (bd == 8) { + const int16x4_t cospis = vld1_s16(kCospi); + + // Rows + a0 = vcombine_s16(vmovn_s32(c0), vmovn_s32(c1)); + a1 = vcombine_s16(vmovn_s32(c2), vmovn_s32(c3)); + idct4x4_16_kernel_bd8(cospis, &a0, &a1); + + // Columns + a1 = vcombine_s16(vget_high_s16(a1), vget_low_s16(a1)); + idct4x4_16_kernel_bd8(cospis, &a0, &a1); + a0 = vrshrq_n_s16(a0, 4); + a1 = vrshrq_n_s16(a1, 4); + } else { + const int32x4_t cospis = vld1q_s32(kCospi32); + + if (bd == 10) { + idct4x4_16_kernel_bd10(cospis, &c0, &c1, &c2, &c3); + idct4x4_16_kernel_bd10(cospis, &c0, &c1, &c2, &c3); + } else { + idct4x4_16_kernel_bd12(cospis, &c0, &c1, &c2, &c3); + idct4x4_16_kernel_bd12(cospis, &c0, &c1, &c2, &c3); + } + a0 = vcombine_s16(vqrshrn_n_s32(c0, 4), vqrshrn_n_s32(c1, 4)); + a1 = vcombine_s16(vqrshrn_n_s32(c3, 4), vqrshrn_n_s32(c2, 4)); + } + + highbd_idct4x4_1_add_kernel1(&dest, stride, a0, max); + highbd_idct4x4_1_add_kernel2(&dest, stride, a1, max); +} diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c new file mode 100644 index 00000000000..c1c0f645d18 --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c @@ -0,0 +1,614 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/inv_txfm.h" + +static INLINE void highbd_idct8x8_1_add_kernel(uint16_t **dest, + const int stride, + const int16x8_t res, + const int16x8_t max) { + const uint16x8_t a = vld1q_u16(*dest); + const int16x8_t b = vaddq_s16(res, vreinterpretq_s16_u16(a)); + const int16x8_t c = vminq_s16(b, max); + const uint16x8_t d = vqshluq_n_s16(c, 0); + vst1q_u16(*dest, d); + *dest += stride; +} + +void vpx_highbd_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + const int16x8_t max = vdupq_n_s16((1 << bd) - 1); + const tran_low_t out0 = + HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); + const tran_low_t out1 = + HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd); + const int16_t a1 = ROUND_POWER_OF_TWO(out1, 5); + const int16x8_t dc = vdupq_n_s16(a1); + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + + highbd_idct8x8_1_add_kernel(&dest, stride, dc, max); + highbd_idct8x8_1_add_kernel(&dest, stride, dc, max); + highbd_idct8x8_1_add_kernel(&dest, stride, dc, max); + highbd_idct8x8_1_add_kernel(&dest, stride, dc, max); + highbd_idct8x8_1_add_kernel(&dest, stride, dc, max); + highbd_idct8x8_1_add_kernel(&dest, stride, dc, max); + highbd_idct8x8_1_add_kernel(&dest, stride, dc, max); + highbd_idct8x8_1_add_kernel(&dest, stride, dc, max); +} + +static INLINE void idct8x8_12_half1d_bd10( + const int32x4_t cospis0, const int32x4_t cospis1, int32x4_t *const io0, + int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3, + int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6, + int32x4_t *const io7) { + int32x4_t step1[8], step2[8]; + + transpose_s32_4x4(io0, io1, io2, io3); + + // stage 1 + step1[4] = vmulq_lane_s32(*io1, vget_high_s32(cospis1), 1); + step1[5] = vmulq_lane_s32(*io3, vget_high_s32(cospis1), 0); + step1[6] = vmulq_lane_s32(*io3, vget_low_s32(cospis1), 1); + step1[7] = vmulq_lane_s32(*io1, vget_low_s32(cospis1), 0); + step1[4] = vrshrq_n_s32(step1[4], 14); + step1[5] = vrshrq_n_s32(step1[5], 14); + step1[6] = vrshrq_n_s32(step1[6], 14); + step1[7] = vrshrq_n_s32(step1[7], 14); + + // stage 2 + step2[1] = vmulq_lane_s32(*io0, vget_high_s32(cospis0), 0); + step2[2] = vmulq_lane_s32(*io2, vget_high_s32(cospis0), 1); + step2[3] = vmulq_lane_s32(*io2, vget_low_s32(cospis0), 1); + step2[1] = vrshrq_n_s32(step2[1], 14); + step2[2] = vrshrq_n_s32(step2[2], 14); + step2[3] = vrshrq_n_s32(step2[3], 14); + + step2[4] = vaddq_s32(step1[4], step1[5]); + step2[5] = vsubq_s32(step1[4], step1[5]); + step2[6] = vsubq_s32(step1[7], step1[6]); + step2[7] = vaddq_s32(step1[7], step1[6]); + + // stage 3 + step1[0] = vaddq_s32(step2[1], step2[3]); + step1[1] = vaddq_s32(step2[1], step2[2]); + step1[2] = vsubq_s32(step2[1], step2[2]); + step1[3] = vsubq_s32(step2[1], step2[3]); + + step1[6] = vmulq_lane_s32(step2[6], vget_high_s32(cospis0), 0); + step1[5] = vmlsq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0); + step1[6] = vmlaq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0); + step1[5] = vrshrq_n_s32(step1[5], 14); + step1[6] = vrshrq_n_s32(step1[6], 14); + + // stage 4 + *io0 = vaddq_s32(step1[0], step2[7]); + *io1 = vaddq_s32(step1[1], step1[6]); + *io2 = vaddq_s32(step1[2], step1[5]); + *io3 = vaddq_s32(step1[3], step2[4]); + *io4 = vsubq_s32(step1[3], step2[4]); + *io5 = vsubq_s32(step1[2], step1[5]); + *io6 = vsubq_s32(step1[1], step1[6]); + *io7 = 
vsubq_s32(step1[0], step2[7]); +} + +static INLINE void idct8x8_12_half1d_bd12( + const int32x4_t cospis0, const int32x4_t cospis1, int32x4_t *const io0, + int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3, + int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6, + int32x4_t *const io7) { + int32x2_t input_1l, input_1h, input_3l, input_3h; + int32x2_t step1l[2], step1h[2]; + int32x4_t step1[8], step2[8]; + int64x2_t t64[8]; + int32x2_t t32[8]; + + transpose_s32_4x4(io0, io1, io2, io3); + + // stage 1 + input_1l = vget_low_s32(*io1); + input_1h = vget_high_s32(*io1); + input_3l = vget_low_s32(*io3); + input_3h = vget_high_s32(*io3); + step1l[0] = vget_low_s32(*io0); + step1h[0] = vget_high_s32(*io0); + step1l[1] = vget_low_s32(*io2); + step1h[1] = vget_high_s32(*io2); + + t64[0] = vmull_lane_s32(input_1l, vget_high_s32(cospis1), 1); + t64[1] = vmull_lane_s32(input_1h, vget_high_s32(cospis1), 1); + t64[2] = vmull_lane_s32(input_3l, vget_high_s32(cospis1), 0); + t64[3] = vmull_lane_s32(input_3h, vget_high_s32(cospis1), 0); + t64[4] = vmull_lane_s32(input_3l, vget_low_s32(cospis1), 1); + t64[5] = vmull_lane_s32(input_3h, vget_low_s32(cospis1), 1); + t64[6] = vmull_lane_s32(input_1l, vget_low_s32(cospis1), 0); + t64[7] = vmull_lane_s32(input_1h, vget_low_s32(cospis1), 0); + t32[0] = vrshrn_n_s64(t64[0], 14); + t32[1] = vrshrn_n_s64(t64[1], 14); + t32[2] = vrshrn_n_s64(t64[2], 14); + t32[3] = vrshrn_n_s64(t64[3], 14); + t32[4] = vrshrn_n_s64(t64[4], 14); + t32[5] = vrshrn_n_s64(t64[5], 14); + t32[6] = vrshrn_n_s64(t64[6], 14); + t32[7] = vrshrn_n_s64(t64[7], 14); + step1[4] = vcombine_s32(t32[0], t32[1]); + step1[5] = vcombine_s32(t32[2], t32[3]); + step1[6] = vcombine_s32(t32[4], t32[5]); + step1[7] = vcombine_s32(t32[6], t32[7]); + + // stage 2 + t64[2] = vmull_lane_s32(step1l[0], vget_high_s32(cospis0), 0); + t64[3] = vmull_lane_s32(step1h[0], vget_high_s32(cospis0), 0); + t64[4] = vmull_lane_s32(step1l[1], vget_high_s32(cospis0), 1); + t64[5] = vmull_lane_s32(step1h[1], vget_high_s32(cospis0), 1); + t64[6] = vmull_lane_s32(step1l[1], vget_low_s32(cospis0), 1); + t64[7] = vmull_lane_s32(step1h[1], vget_low_s32(cospis0), 1); + t32[2] = vrshrn_n_s64(t64[2], 14); + t32[3] = vrshrn_n_s64(t64[3], 14); + t32[4] = vrshrn_n_s64(t64[4], 14); + t32[5] = vrshrn_n_s64(t64[5], 14); + t32[6] = vrshrn_n_s64(t64[6], 14); + t32[7] = vrshrn_n_s64(t64[7], 14); + step2[1] = vcombine_s32(t32[2], t32[3]); + step2[2] = vcombine_s32(t32[4], t32[5]); + step2[3] = vcombine_s32(t32[6], t32[7]); + + step2[4] = vaddq_s32(step1[4], step1[5]); + step2[5] = vsubq_s32(step1[4], step1[5]); + step2[6] = vsubq_s32(step1[7], step1[6]); + step2[7] = vaddq_s32(step1[7], step1[6]); + + // stage 3 + step1[0] = vaddq_s32(step2[1], step2[3]); + step1[1] = vaddq_s32(step2[1], step2[2]); + step1[2] = vsubq_s32(step2[1], step2[2]); + step1[3] = vsubq_s32(step2[1], step2[3]); + + t64[2] = vmull_lane_s32(vget_low_s32(step2[6]), vget_high_s32(cospis0), 0); + t64[3] = vmull_lane_s32(vget_high_s32(step2[6]), vget_high_s32(cospis0), 0); + t64[0] = + vmlsl_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0); + t64[1] = vmlsl_lane_s32(t64[3], vget_high_s32(step2[5]), + vget_high_s32(cospis0), 0); + t64[2] = + vmlal_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0); + t64[3] = vmlal_lane_s32(t64[3], vget_high_s32(step2[5]), + vget_high_s32(cospis0), 0); + t32[0] = vrshrn_n_s64(t64[0], 14); + t32[1] = vrshrn_n_s64(t64[1], 14); + t32[2] = vrshrn_n_s64(t64[2], 14); + t32[3] = 
vrshrn_n_s64(t64[3], 14); + step1[5] = vcombine_s32(t32[0], t32[1]); + step1[6] = vcombine_s32(t32[2], t32[3]); + + // stage 4 + *io0 = vaddq_s32(step1[0], step2[7]); + *io1 = vaddq_s32(step1[1], step1[6]); + *io2 = vaddq_s32(step1[2], step1[5]); + *io3 = vaddq_s32(step1[3], step2[4]); + *io4 = vsubq_s32(step1[3], step2[4]); + *io5 = vsubq_s32(step1[2], step1[5]); + *io6 = vsubq_s32(step1[1], step1[6]); + *io7 = vsubq_s32(step1[0], step2[7]); +} + +static INLINE void highbd_add8x8(int16x8_t a0, int16x8_t a1, int16x8_t a2, + int16x8_t a3, int16x8_t a4, int16x8_t a5, + int16x8_t a6, int16x8_t a7, uint16_t *dest, + const int stride, const int bd) { + const int16x8_t max = vdupq_n_s16((1 << bd) - 1); + const uint16_t *dst = dest; + uint16x8_t d0, d1, d2, d3, d4, d5, d6, d7; + uint16x8_t d0_u16, d1_u16, d2_u16, d3_u16, d4_u16, d5_u16, d6_u16, d7_u16; + int16x8_t d0_s16, d1_s16, d2_s16, d3_s16, d4_s16, d5_s16, d6_s16, d7_s16; + + d0 = vld1q_u16(dst); + dst += stride; + d1 = vld1q_u16(dst); + dst += stride; + d2 = vld1q_u16(dst); + dst += stride; + d3 = vld1q_u16(dst); + dst += stride; + d4 = vld1q_u16(dst); + dst += stride; + d5 = vld1q_u16(dst); + dst += stride; + d6 = vld1q_u16(dst); + dst += stride; + d7 = vld1q_u16(dst); + + d0_s16 = vqaddq_s16(a0, vreinterpretq_s16_u16(d0)); + d1_s16 = vqaddq_s16(a1, vreinterpretq_s16_u16(d1)); + d2_s16 = vqaddq_s16(a2, vreinterpretq_s16_u16(d2)); + d3_s16 = vqaddq_s16(a3, vreinterpretq_s16_u16(d3)); + d4_s16 = vqaddq_s16(a4, vreinterpretq_s16_u16(d4)); + d5_s16 = vqaddq_s16(a5, vreinterpretq_s16_u16(d5)); + d6_s16 = vqaddq_s16(a6, vreinterpretq_s16_u16(d6)); + d7_s16 = vqaddq_s16(a7, vreinterpretq_s16_u16(d7)); + + d0_s16 = vminq_s16(d0_s16, max); + d1_s16 = vminq_s16(d1_s16, max); + d2_s16 = vminq_s16(d2_s16, max); + d3_s16 = vminq_s16(d3_s16, max); + d4_s16 = vminq_s16(d4_s16, max); + d5_s16 = vminq_s16(d5_s16, max); + d6_s16 = vminq_s16(d6_s16, max); + d7_s16 = vminq_s16(d7_s16, max); + d0_u16 = vqshluq_n_s16(d0_s16, 0); + d1_u16 = vqshluq_n_s16(d1_s16, 0); + d2_u16 = vqshluq_n_s16(d2_s16, 0); + d3_u16 = vqshluq_n_s16(d3_s16, 0); + d4_u16 = vqshluq_n_s16(d4_s16, 0); + d5_u16 = vqshluq_n_s16(d5_s16, 0); + d6_u16 = vqshluq_n_s16(d6_s16, 0); + d7_u16 = vqshluq_n_s16(d7_s16, 0); + + vst1q_u16(dest, d0_u16); + dest += stride; + vst1q_u16(dest, d1_u16); + dest += stride; + vst1q_u16(dest, d2_u16); + dest += stride; + vst1q_u16(dest, d3_u16); + dest += stride; + vst1q_u16(dest, d4_u16); + dest += stride; + vst1q_u16(dest, d5_u16); + dest += stride; + vst1q_u16(dest, d6_u16); + dest += stride; + vst1q_u16(dest, d7_u16); +} + +void vpx_highbd_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + int32x4_t a0 = vld1q_s32(input); + int32x4_t a1 = vld1q_s32(input + 8); + int32x4_t a2 = vld1q_s32(input + 16); + int32x4_t a3 = vld1q_s32(input + 24); + int16x8_t c0, c1, c2, c3, c4, c5, c6, c7; + + if (bd == 8) { + const int16x8_t cospis = vld1q_s16(kCospi); + const int16x8_t cospisd = vaddq_s16(cospis, cospis); + const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24 + const int16x4_t cospisd0 = vget_low_s16(cospisd); // doubled 0, 8, 16, 24 + const int16x4_t cospisd1 = vget_high_s16(cospisd); // doubled 4, 12, 20, 28 + int16x4_t b0 = vmovn_s32(a0); + int16x4_t b1 = vmovn_s32(a1); + int16x4_t b2 = vmovn_s32(a2); + int16x4_t b3 = vmovn_s32(a3); + int16x4_t b4, b5, b6, b7; + + idct8x8_12_pass1_bd8(cospis0, cospisd0, cospisd1, &b0, &b1, &b2, &b3, &b4, + &b5, &b6, &b7); + 
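+ // Pass 1 runs the first 1-D transform on the four rows holding the (at
+ // most 12) nonzero coefficients; pass 2 runs the second 1-D transform over
+ // the columns to produce the full 8x8 result, which is then rounded by 5
+ // bits below.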
idct8x8_12_pass2_bd8(cospis0, cospisd0, cospisd1, b0, b1, b2, b3, b4, b5, + b6, b7, &c0, &c1, &c2, &c3, &c4, &c5, &c6, &c7); + c0 = vrshrq_n_s16(c0, 5); + c1 = vrshrq_n_s16(c1, 5); + c2 = vrshrq_n_s16(c2, 5); + c3 = vrshrq_n_s16(c3, 5); + c4 = vrshrq_n_s16(c4, 5); + c5 = vrshrq_n_s16(c5, 5); + c6 = vrshrq_n_s16(c6, 5); + c7 = vrshrq_n_s16(c7, 5); + } else { + const int32x4_t cospis0 = vld1q_s32(kCospi32); // cospi 0, 8, 16, 24 + const int32x4_t cospis1 = vld1q_s32(kCospi32 + 4); // cospi 4, 12, 20, 28 + int32x4_t a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15; + + if (bd == 10) { + idct8x8_12_half1d_bd10(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5, + &a6, &a7); + idct8x8_12_half1d_bd10(cospis0, cospis1, &a0, &a1, &a2, &a3, &a8, &a9, + &a10, &a11); + idct8x8_12_half1d_bd10(cospis0, cospis1, &a4, &a5, &a6, &a7, &a12, &a13, + &a14, &a15); + } else { + idct8x8_12_half1d_bd12(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5, + &a6, &a7); + idct8x8_12_half1d_bd12(cospis0, cospis1, &a0, &a1, &a2, &a3, &a8, &a9, + &a10, &a11); + idct8x8_12_half1d_bd12(cospis0, cospis1, &a4, &a5, &a6, &a7, &a12, &a13, + &a14, &a15); + } + c0 = vcombine_s16(vrshrn_n_s32(a0, 5), vrshrn_n_s32(a4, 5)); + c1 = vcombine_s16(vrshrn_n_s32(a1, 5), vrshrn_n_s32(a5, 5)); + c2 = vcombine_s16(vrshrn_n_s32(a2, 5), vrshrn_n_s32(a6, 5)); + c3 = vcombine_s16(vrshrn_n_s32(a3, 5), vrshrn_n_s32(a7, 5)); + c4 = vcombine_s16(vrshrn_n_s32(a8, 5), vrshrn_n_s32(a12, 5)); + c5 = vcombine_s16(vrshrn_n_s32(a9, 5), vrshrn_n_s32(a13, 5)); + c6 = vcombine_s16(vrshrn_n_s32(a10, 5), vrshrn_n_s32(a14, 5)); + c7 = vcombine_s16(vrshrn_n_s32(a11, 5), vrshrn_n_s32(a15, 5)); + } + highbd_add8x8(c0, c1, c2, c3, c4, c5, c6, c7, dest, stride, bd); +} + +static INLINE void idct8x8_64_half1d_bd10( + const int32x4_t cospis0, const int32x4_t cospis1, int32x4_t *const io0, + int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3, + int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6, + int32x4_t *const io7) { + int32x4_t step1[8], step2[8]; + + transpose_s32_8x4(io0, io1, io2, io3, io4, io5, io6, io7); + + // stage 1 + step1[4] = vmulq_lane_s32(*io1, vget_high_s32(cospis1), 1); + step1[5] = vmulq_lane_s32(*io3, vget_high_s32(cospis1), 0); + step1[6] = vmulq_lane_s32(*io3, vget_low_s32(cospis1), 1); + step1[7] = vmulq_lane_s32(*io1, vget_low_s32(cospis1), 0); + + step1[4] = vmlsq_lane_s32(step1[4], *io7, vget_low_s32(cospis1), 0); + step1[5] = vmlaq_lane_s32(step1[5], *io5, vget_low_s32(cospis1), 1); + step1[6] = vmlsq_lane_s32(step1[6], *io5, vget_high_s32(cospis1), 0); + step1[7] = vmlaq_lane_s32(step1[7], *io7, vget_high_s32(cospis1), 1); + + step1[4] = vrshrq_n_s32(step1[4], 14); + step1[5] = vrshrq_n_s32(step1[5], 14); + step1[6] = vrshrq_n_s32(step1[6], 14); + step1[7] = vrshrq_n_s32(step1[7], 14); + + // stage 2 + step2[1] = vmulq_lane_s32(*io0, vget_high_s32(cospis0), 0); + step2[2] = vmulq_lane_s32(*io2, vget_high_s32(cospis0), 1); + step2[3] = vmulq_lane_s32(*io2, vget_low_s32(cospis0), 1); + + step2[0] = vmlaq_lane_s32(step2[1], *io4, vget_high_s32(cospis0), 0); + step2[1] = vmlsq_lane_s32(step2[1], *io4, vget_high_s32(cospis0), 0); + step2[2] = vmlsq_lane_s32(step2[2], *io6, vget_low_s32(cospis0), 1); + step2[3] = vmlaq_lane_s32(step2[3], *io6, vget_high_s32(cospis0), 1); + + step2[0] = vrshrq_n_s32(step2[0], 14); + step2[1] = vrshrq_n_s32(step2[1], 14); + step2[2] = vrshrq_n_s32(step2[2], 14); + step2[3] = vrshrq_n_s32(step2[3], 14); + + step2[4] = vaddq_s32(step1[4], step1[5]); + step2[5] = vsubq_s32(step1[4], 
step1[5]); + step2[6] = vsubq_s32(step1[7], step1[6]); + step2[7] = vaddq_s32(step1[7], step1[6]); + + // stage 3 + step1[0] = vaddq_s32(step2[0], step2[3]); + step1[1] = vaddq_s32(step2[1], step2[2]); + step1[2] = vsubq_s32(step2[1], step2[2]); + step1[3] = vsubq_s32(step2[0], step2[3]); + + step1[6] = vmulq_lane_s32(step2[6], vget_high_s32(cospis0), 0); + step1[5] = vmlsq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0); + step1[6] = vmlaq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0); + step1[5] = vrshrq_n_s32(step1[5], 14); + step1[6] = vrshrq_n_s32(step1[6], 14); + + // stage 4 + *io0 = vaddq_s32(step1[0], step2[7]); + *io1 = vaddq_s32(step1[1], step1[6]); + *io2 = vaddq_s32(step1[2], step1[5]); + *io3 = vaddq_s32(step1[3], step2[4]); + *io4 = vsubq_s32(step1[3], step2[4]); + *io5 = vsubq_s32(step1[2], step1[5]); + *io6 = vsubq_s32(step1[1], step1[6]); + *io7 = vsubq_s32(step1[0], step2[7]); +} + +static INLINE void idct8x8_64_half1d_bd12( + const int32x4_t cospis0, const int32x4_t cospis1, int32x4_t *const io0, + int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3, + int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6, + int32x4_t *const io7) { + int32x2_t input_1l, input_1h, input_3l, input_3h, input_5l, input_5h, + input_7l, input_7h; + int32x2_t step1l[4], step1h[4]; + int32x4_t step1[8], step2[8]; + int64x2_t t64[8]; + int32x2_t t32[8]; + + transpose_s32_8x4(io0, io1, io2, io3, io4, io5, io6, io7); + + // stage 1 + input_1l = vget_low_s32(*io1); + input_1h = vget_high_s32(*io1); + input_3l = vget_low_s32(*io3); + input_3h = vget_high_s32(*io3); + input_5l = vget_low_s32(*io5); + input_5h = vget_high_s32(*io5); + input_7l = vget_low_s32(*io7); + input_7h = vget_high_s32(*io7); + step1l[0] = vget_low_s32(*io0); + step1h[0] = vget_high_s32(*io0); + step1l[1] = vget_low_s32(*io2); + step1h[1] = vget_high_s32(*io2); + step1l[2] = vget_low_s32(*io4); + step1h[2] = vget_high_s32(*io4); + step1l[3] = vget_low_s32(*io6); + step1h[3] = vget_high_s32(*io6); + + t64[0] = vmull_lane_s32(input_1l, vget_high_s32(cospis1), 1); + t64[1] = vmull_lane_s32(input_1h, vget_high_s32(cospis1), 1); + t64[2] = vmull_lane_s32(input_3l, vget_high_s32(cospis1), 0); + t64[3] = vmull_lane_s32(input_3h, vget_high_s32(cospis1), 0); + t64[4] = vmull_lane_s32(input_3l, vget_low_s32(cospis1), 1); + t64[5] = vmull_lane_s32(input_3h, vget_low_s32(cospis1), 1); + t64[6] = vmull_lane_s32(input_1l, vget_low_s32(cospis1), 0); + t64[7] = vmull_lane_s32(input_1h, vget_low_s32(cospis1), 0); + t64[0] = vmlsl_lane_s32(t64[0], input_7l, vget_low_s32(cospis1), 0); + t64[1] = vmlsl_lane_s32(t64[1], input_7h, vget_low_s32(cospis1), 0); + t64[2] = vmlal_lane_s32(t64[2], input_5l, vget_low_s32(cospis1), 1); + t64[3] = vmlal_lane_s32(t64[3], input_5h, vget_low_s32(cospis1), 1); + t64[4] = vmlsl_lane_s32(t64[4], input_5l, vget_high_s32(cospis1), 0); + t64[5] = vmlsl_lane_s32(t64[5], input_5h, vget_high_s32(cospis1), 0); + t64[6] = vmlal_lane_s32(t64[6], input_7l, vget_high_s32(cospis1), 1); + t64[7] = vmlal_lane_s32(t64[7], input_7h, vget_high_s32(cospis1), 1); + t32[0] = vrshrn_n_s64(t64[0], 14); + t32[1] = vrshrn_n_s64(t64[1], 14); + t32[2] = vrshrn_n_s64(t64[2], 14); + t32[3] = vrshrn_n_s64(t64[3], 14); + t32[4] = vrshrn_n_s64(t64[4], 14); + t32[5] = vrshrn_n_s64(t64[5], 14); + t32[6] = vrshrn_n_s64(t64[6], 14); + t32[7] = vrshrn_n_s64(t64[7], 14); + step1[4] = vcombine_s32(t32[0], t32[1]); + step1[5] = vcombine_s32(t32[2], t32[3]); + step1[6] = vcombine_s32(t32[4], t32[5]); + step1[7] = 
vcombine_s32(t32[6], t32[7]); + + // stage 2 + t64[2] = vmull_lane_s32(step1l[0], vget_high_s32(cospis0), 0); + t64[3] = vmull_lane_s32(step1h[0], vget_high_s32(cospis0), 0); + t64[4] = vmull_lane_s32(step1l[1], vget_high_s32(cospis0), 1); + t64[5] = vmull_lane_s32(step1h[1], vget_high_s32(cospis0), 1); + t64[6] = vmull_lane_s32(step1l[1], vget_low_s32(cospis0), 1); + t64[7] = vmull_lane_s32(step1h[1], vget_low_s32(cospis0), 1); + t64[0] = vmlal_lane_s32(t64[2], step1l[2], vget_high_s32(cospis0), 0); + t64[1] = vmlal_lane_s32(t64[3], step1h[2], vget_high_s32(cospis0), 0); + t64[2] = vmlsl_lane_s32(t64[2], step1l[2], vget_high_s32(cospis0), 0); + t64[3] = vmlsl_lane_s32(t64[3], step1h[2], vget_high_s32(cospis0), 0); + t64[4] = vmlsl_lane_s32(t64[4], step1l[3], vget_low_s32(cospis0), 1); + t64[5] = vmlsl_lane_s32(t64[5], step1h[3], vget_low_s32(cospis0), 1); + t64[6] = vmlal_lane_s32(t64[6], step1l[3], vget_high_s32(cospis0), 1); + t64[7] = vmlal_lane_s32(t64[7], step1h[3], vget_high_s32(cospis0), 1); + t32[0] = vrshrn_n_s64(t64[0], 14); + t32[1] = vrshrn_n_s64(t64[1], 14); + t32[2] = vrshrn_n_s64(t64[2], 14); + t32[3] = vrshrn_n_s64(t64[3], 14); + t32[4] = vrshrn_n_s64(t64[4], 14); + t32[5] = vrshrn_n_s64(t64[5], 14); + t32[6] = vrshrn_n_s64(t64[6], 14); + t32[7] = vrshrn_n_s64(t64[7], 14); + step2[0] = vcombine_s32(t32[0], t32[1]); + step2[1] = vcombine_s32(t32[2], t32[3]); + step2[2] = vcombine_s32(t32[4], t32[5]); + step2[3] = vcombine_s32(t32[6], t32[7]); + + step2[4] = vaddq_s32(step1[4], step1[5]); + step2[5] = vsubq_s32(step1[4], step1[5]); + step2[6] = vsubq_s32(step1[7], step1[6]); + step2[7] = vaddq_s32(step1[7], step1[6]); + + // stage 3 + step1[0] = vaddq_s32(step2[0], step2[3]); + step1[1] = vaddq_s32(step2[1], step2[2]); + step1[2] = vsubq_s32(step2[1], step2[2]); + step1[3] = vsubq_s32(step2[0], step2[3]); + + t64[2] = vmull_lane_s32(vget_low_s32(step2[6]), vget_high_s32(cospis0), 0); + t64[3] = vmull_lane_s32(vget_high_s32(step2[6]), vget_high_s32(cospis0), 0); + t64[0] = + vmlsl_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0); + t64[1] = vmlsl_lane_s32(t64[3], vget_high_s32(step2[5]), + vget_high_s32(cospis0), 0); + t64[2] = + vmlal_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0); + t64[3] = vmlal_lane_s32(t64[3], vget_high_s32(step2[5]), + vget_high_s32(cospis0), 0); + t32[0] = vrshrn_n_s64(t64[0], 14); + t32[1] = vrshrn_n_s64(t64[1], 14); + t32[2] = vrshrn_n_s64(t64[2], 14); + t32[3] = vrshrn_n_s64(t64[3], 14); + step1[5] = vcombine_s32(t32[0], t32[1]); + step1[6] = vcombine_s32(t32[2], t32[3]); + + // stage 4 + *io0 = vaddq_s32(step1[0], step2[7]); + *io1 = vaddq_s32(step1[1], step1[6]); + *io2 = vaddq_s32(step1[2], step1[5]); + *io3 = vaddq_s32(step1[3], step2[4]); + *io4 = vsubq_s32(step1[3], step2[4]); + *io5 = vsubq_s32(step1[2], step1[5]); + *io6 = vsubq_s32(step1[1], step1[6]); + *io7 = vsubq_s32(step1[0], step2[7]); +} + +void vpx_highbd_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + int32x4_t a0 = vld1q_s32(input); + int32x4_t a1 = vld1q_s32(input + 4); + int32x4_t a2 = vld1q_s32(input + 8); + int32x4_t a3 = vld1q_s32(input + 12); + int32x4_t a4 = vld1q_s32(input + 16); + int32x4_t a5 = vld1q_s32(input + 20); + int32x4_t a6 = vld1q_s32(input + 24); + int32x4_t a7 = vld1q_s32(input + 28); + int32x4_t a8 = vld1q_s32(input + 32); + int32x4_t a9 = vld1q_s32(input + 36); + int32x4_t a10 = vld1q_s32(input + 40); + int32x4_t a11 = 
vld1q_s32(input + 44); + int32x4_t a12 = vld1q_s32(input + 48); + int32x4_t a13 = vld1q_s32(input + 52); + int32x4_t a14 = vld1q_s32(input + 56); + int32x4_t a15 = vld1q_s32(input + 60); + int16x8_t c0, c1, c2, c3, c4, c5, c6, c7; + + if (bd == 8) { + const int16x8_t cospis = vld1q_s16(kCospi); + const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24 + const int16x4_t cospis1 = vget_high_s16(cospis); // cospi 4, 12, 20, 28 + int16x8_t b0 = vcombine_s16(vmovn_s32(a0), vmovn_s32(a1)); + int16x8_t b1 = vcombine_s16(vmovn_s32(a2), vmovn_s32(a3)); + int16x8_t b2 = vcombine_s16(vmovn_s32(a4), vmovn_s32(a5)); + int16x8_t b3 = vcombine_s16(vmovn_s32(a6), vmovn_s32(a7)); + int16x8_t b4 = vcombine_s16(vmovn_s32(a8), vmovn_s32(a9)); + int16x8_t b5 = vcombine_s16(vmovn_s32(a10), vmovn_s32(a11)); + int16x8_t b6 = vcombine_s16(vmovn_s32(a12), vmovn_s32(a13)); + int16x8_t b7 = vcombine_s16(vmovn_s32(a14), vmovn_s32(a15)); + + idct8x8_64_1d_bd8(cospis0, cospis1, &b0, &b1, &b2, &b3, &b4, &b5, &b6, &b7); + idct8x8_64_1d_bd8(cospis0, cospis1, &b0, &b1, &b2, &b3, &b4, &b5, &b6, &b7); + + c0 = vrshrq_n_s16(b0, 5); + c1 = vrshrq_n_s16(b1, 5); + c2 = vrshrq_n_s16(b2, 5); + c3 = vrshrq_n_s16(b3, 5); + c4 = vrshrq_n_s16(b4, 5); + c5 = vrshrq_n_s16(b5, 5); + c6 = vrshrq_n_s16(b6, 5); + c7 = vrshrq_n_s16(b7, 5); + } else { + const int32x4_t cospis0 = vld1q_s32(kCospi32); // cospi 0, 8, 16, 24 + const int32x4_t cospis1 = vld1q_s32(kCospi32 + 4); // cospi 4, 12, 20, 28 + + if (bd == 10) { + idct8x8_64_half1d_bd10(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5, + &a6, &a7); + idct8x8_64_half1d_bd10(cospis0, cospis1, &a8, &a9, &a10, &a11, &a12, &a13, + &a14, &a15); + idct8x8_64_half1d_bd10(cospis0, cospis1, &a0, &a8, &a1, &a9, &a2, &a10, + &a3, &a11); + idct8x8_64_half1d_bd10(cospis0, cospis1, &a4, &a12, &a5, &a13, &a6, &a14, + &a7, &a15); + } else { + idct8x8_64_half1d_bd12(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5, + &a6, &a7); + idct8x8_64_half1d_bd12(cospis0, cospis1, &a8, &a9, &a10, &a11, &a12, &a13, + &a14, &a15); + idct8x8_64_half1d_bd12(cospis0, cospis1, &a0, &a8, &a1, &a9, &a2, &a10, + &a3, &a11); + idct8x8_64_half1d_bd12(cospis0, cospis1, &a4, &a12, &a5, &a13, &a6, &a14, + &a7, &a15); + } + c0 = vcombine_s16(vrshrn_n_s32(a0, 5), vrshrn_n_s32(a4, 5)); + c1 = vcombine_s16(vrshrn_n_s32(a8, 5), vrshrn_n_s32(a12, 5)); + c2 = vcombine_s16(vrshrn_n_s32(a1, 5), vrshrn_n_s32(a5, 5)); + c3 = vcombine_s16(vrshrn_n_s32(a9, 5), vrshrn_n_s32(a13, 5)); + c4 = vcombine_s16(vrshrn_n_s32(a2, 5), vrshrn_n_s32(a6, 5)); + c5 = vcombine_s16(vrshrn_n_s32(a10, 5), vrshrn_n_s32(a14, 5)); + c6 = vcombine_s16(vrshrn_n_s32(a3, 5), vrshrn_n_s32(a7, 5)); + c7 = vcombine_s16(vrshrn_n_s32(a11, 5), vrshrn_n_s32(a15, 5)); + } + highbd_add8x8(c0, c1, c2, c3, c4, c5, c6, c7, dest, stride, bd); +} diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_intrapred_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_intrapred_neon.c new file mode 100644 index 00000000000..6f7e5da7627 --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_intrapred_neon.c @@ -0,0 +1,1078 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" + +//------------------------------------------------------------------------------ +// DC 4x4 + +static INLINE uint16x4_t dc_sum_4(const uint16_t *ref) { + const uint16x4_t ref_u16 = vld1_u16(ref); + const uint16x4_t p0 = vpadd_u16(ref_u16, ref_u16); + return vpadd_u16(p0, p0); +} + +static INLINE void dc_store_4x4(uint16_t *dst, ptrdiff_t stride, + const uint16x4_t dc) { + const uint16x4_t dc_dup = vdup_lane_u16(dc, 0); + int i; + for (i = 0; i < 4; ++i, dst += stride) { + vst1_u16(dst, dc_dup); + } +} + +void vpx_highbd_dc_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x4_t a = vld1_u16(above); + const uint16x4_t l = vld1_u16(left); + uint16x4_t sum; + uint16x4_t dc; + (void)bd; + sum = vadd_u16(a, l); + sum = vpadd_u16(sum, sum); + sum = vpadd_u16(sum, sum); + dc = vrshr_n_u16(sum, 3); + dc_store_4x4(dst, stride, dc); +} + +void vpx_highbd_dc_left_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x4_t sum = dc_sum_4(left); + const uint16x4_t dc = vrshr_n_u16(sum, 2); + (void)above; + (void)bd; + dc_store_4x4(dst, stride, dc); +} + +void vpx_highbd_dc_top_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x4_t sum = dc_sum_4(above); + const uint16x4_t dc = vrshr_n_u16(sum, 2); + (void)left; + (void)bd; + dc_store_4x4(dst, stride, dc); +} + +void vpx_highbd_dc_128_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x4_t dc = vdup_n_u16(1 << (bd - 1)); + (void)above; + (void)left; + dc_store_4x4(dst, stride, dc); +} + +//------------------------------------------------------------------------------ +// DC 8x8 + +static INLINE uint16x4_t dc_sum_8(const uint16_t *ref) { + const uint16x8_t ref_u16 = vld1q_u16(ref); + uint16x4_t sum = vadd_u16(vget_low_u16(ref_u16), vget_high_u16(ref_u16)); + sum = vpadd_u16(sum, sum); + return vpadd_u16(sum, sum); +} + +static INLINE void dc_store_8x8(uint16_t *dst, ptrdiff_t stride, + const uint16x4_t dc) { + const uint16x8_t dc_dup = vdupq_lane_u16(dc, 0); + int i; + for (i = 0; i < 8; ++i, dst += stride) { + vst1q_u16(dst, dc_dup); + } +} + +void vpx_highbd_dc_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8_t above_u16 = vld1q_u16(above); + const uint16x8_t left_u16 = vld1q_u16(left); + const uint16x8_t p0 = vaddq_u16(above_u16, left_u16); + uint16x4_t sum = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); + uint16x4_t dc; + (void)bd; + sum = vpadd_u16(sum, sum); + sum = vpadd_u16(sum, sum); + dc = vrshr_n_u16(sum, 4); + dc_store_8x8(dst, stride, dc); +} + +void vpx_highbd_dc_left_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x4_t sum = dc_sum_8(left); + const uint16x4_t dc = vrshr_n_u16(sum, 3); + (void)above; + (void)bd; + dc_store_8x8(dst, stride, dc); +} + +void vpx_highbd_dc_top_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x4_t sum = dc_sum_8(above); + const uint16x4_t dc = vrshr_n_u16(sum, 3); + (void)left; 
+ (void)bd; + dc_store_8x8(dst, stride, dc); +} + +void vpx_highbd_dc_128_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x4_t dc = vdup_n_u16(1 << (bd - 1)); + (void)above; + (void)left; + dc_store_8x8(dst, stride, dc); +} + +//------------------------------------------------------------------------------ +// DC 16x16 + +static INLINE uint16x4_t dc_sum_16(const uint16_t *ref) { + const uint16x8x2_t ref_u16 = vld2q_u16(ref); + const uint16x8_t p0 = vaddq_u16(ref_u16.val[0], ref_u16.val[1]); + uint16x4_t sum = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); + sum = vpadd_u16(sum, sum); + return vpadd_u16(sum, sum); +} + +static INLINE void dc_store_16x16(uint16_t *dst, ptrdiff_t stride, + const uint16x4_t dc) { + uint16x8x2_t dc_dup; + int i; + dc_dup.val[0] = dc_dup.val[1] = vdupq_lane_u16(dc, 0); + for (i = 0; i < 16; ++i, dst += stride) { + vst2q_u16(dst, dc_dup); + } +} + +void vpx_highbd_dc_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8x2_t a = vld2q_u16(above); + const uint16x8x2_t l = vld2q_u16(left); + const uint16x8_t pa = vaddq_u16(a.val[0], a.val[1]); + const uint16x8_t pl = vaddq_u16(l.val[0], l.val[1]); + const uint16x8_t pal0 = vaddq_u16(pa, pl); + uint16x4_t pal1 = vadd_u16(vget_low_u16(pal0), vget_high_u16(pal0)); + uint32x2_t sum; + uint16x4_t dc; + (void)bd; + pal1 = vpadd_u16(pal1, pal1); + sum = vpaddl_u16(pal1); + dc = vreinterpret_u16_u32(vrshr_n_u32(sum, 5)); + dc_store_16x16(dst, stride, dc); +} + +void vpx_highbd_dc_left_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x4_t sum = dc_sum_16(left); + const uint16x4_t dc = vrshr_n_u16(sum, 4); + (void)above; + (void)bd; + dc_store_16x16(dst, stride, dc); +} + +void vpx_highbd_dc_top_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x4_t sum = dc_sum_16(above); + const uint16x4_t dc = vrshr_n_u16(sum, 4); + (void)left; + (void)bd; + dc_store_16x16(dst, stride, dc); +} + +void vpx_highbd_dc_128_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x4_t dc = vdup_n_u16(1 << (bd - 1)); + (void)above; + (void)left; + dc_store_16x16(dst, stride, dc); +} + +//------------------------------------------------------------------------------ +// DC 32x32 + +static INLINE uint32x2_t dc_sum_32(const uint16_t *ref) { + const uint16x8x4_t r = vld4q_u16(ref); + const uint16x8_t p0 = vaddq_u16(r.val[0], r.val[1]); + const uint16x8_t p1 = vaddq_u16(r.val[2], r.val[3]); + const uint16x8_t p2 = vaddq_u16(p0, p1); + uint16x4_t sum = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); + sum = vpadd_u16(sum, sum); + return vpaddl_u16(sum); +} + +static INLINE void dc_store_32x32(uint16_t *dst, ptrdiff_t stride, + const uint16x4_t dc) { + uint16x8x2_t dc_dup; + int i; + dc_dup.val[0] = dc_dup.val[1] = vdupq_lane_u16(dc, 0); + + for (i = 0; i < 32; ++i) { + vst2q_u16(dst, dc_dup); + dst += 16; + vst2q_u16(dst, dc_dup); + dst += stride - 16; + } +} + +void vpx_highbd_dc_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8x4_t a = vld4q_u16(above); + const uint16x8x4_t l = vld4q_u16(left); + const uint16x8_t pa0 = vaddq_u16(a.val[0], a.val[1]); + const uint16x8_t pa1 = vaddq_u16(a.val[2], 
a.val[3]); + const uint16x8_t pl0 = vaddq_u16(l.val[0], l.val[1]); + const uint16x8_t pl1 = vaddq_u16(l.val[2], l.val[3]); + const uint16x8_t pa = vaddq_u16(pa0, pa1); + const uint16x8_t pl = vaddq_u16(pl0, pl1); + const uint16x8_t pal0 = vaddq_u16(pa, pl); + const uint16x4_t pal1 = vadd_u16(vget_low_u16(pal0), vget_high_u16(pal0)); + uint32x2_t sum = vpaddl_u16(pal1); + uint16x4_t dc; + (void)bd; + sum = vpadd_u32(sum, sum); + dc = vreinterpret_u16_u32(vrshr_n_u32(sum, 6)); + dc_store_32x32(dst, stride, dc); +} + +void vpx_highbd_dc_left_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint32x2_t sum = dc_sum_32(left); + const uint16x4_t dc = vreinterpret_u16_u32(vrshr_n_u32(sum, 5)); + (void)above; + (void)bd; + dc_store_32x32(dst, stride, dc); +} + +void vpx_highbd_dc_top_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint32x2_t sum = dc_sum_32(above); + const uint16x4_t dc = vreinterpret_u16_u32(vrshr_n_u32(sum, 5)); + (void)left; + (void)bd; + dc_store_32x32(dst, stride, dc); +} + +void vpx_highbd_dc_128_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x4_t dc = vdup_n_u16(1 << (bd - 1)); + (void)above; + (void)left; + dc_store_32x32(dst, stride, dc); +} + +// ----------------------------------------------------------------------------- + +void vpx_highbd_d45_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8_t ABCDEFGH = vld1q_u16(above); + const uint16x8_t BCDEFGH0 = vld1q_u16(above + 1); + const uint16x8_t CDEFGH00 = vld1q_u16(above + 2); + const uint16x8_t avg1 = vhaddq_u16(ABCDEFGH, CDEFGH00); + const uint16x8_t avg2 = vrhaddq_u16(avg1, BCDEFGH0); + const uint16x4_t avg2_low = vget_low_u16(avg2); + const uint16x4_t avg2_high = vget_high_u16(avg2); + const uint16x4_t r1 = vext_u16(avg2_low, avg2_high, 1); + const uint16x4_t r2 = vext_u16(avg2_low, avg2_high, 2); + const uint16x4_t r3 = vext_u16(avg2_low, avg2_high, 3); + (void)left; + (void)bd; + vst1_u16(dst, avg2_low); + dst += stride; + vst1_u16(dst, r1); + dst += stride; + vst1_u16(dst, r2); + dst += stride; + vst1_u16(dst, r3); + vst1q_lane_u16(dst + 3, ABCDEFGH, 7); +} + +static INLINE void d45_store_8(uint16_t **dst, const ptrdiff_t stride, + const uint16x8_t above_right, uint16x8_t *row) { + *row = vextq_u16(*row, above_right, 1); + vst1q_u16(*dst, *row); + *dst += stride; +} + +void vpx_highbd_d45_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8_t A0 = vld1q_u16(above); + const uint16x8_t above_right = vdupq_lane_u16(vget_high_u16(A0), 3); + const uint16x8_t A1 = vld1q_u16(above + 1); + const uint16x8_t A2 = vld1q_u16(above + 2); + const uint16x8_t avg1 = vhaddq_u16(A0, A2); + uint16x8_t row = vrhaddq_u16(avg1, A1); + (void)left; + (void)bd; + + vst1q_u16(dst, row); + dst += stride; + d45_store_8(&dst, stride, above_right, &row); + d45_store_8(&dst, stride, above_right, &row); + d45_store_8(&dst, stride, above_right, &row); + d45_store_8(&dst, stride, above_right, &row); + d45_store_8(&dst, stride, above_right, &row); + d45_store_8(&dst, stride, above_right, &row); + vst1q_u16(dst, above_right); +} + +static INLINE void d45_store_16(uint16_t **dst, const ptrdiff_t stride, + const uint16x8_t above_right, uint16x8_t *row_0, + uint16x8_t *row_1) { + 
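+ // Shift the 16-wide row left by one sample, pulling above_right in at the
+ // far right, then store it as two 8-lane halves and advance to the next row.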
*row_0 = vextq_u16(*row_0, *row_1, 1); + *row_1 = vextq_u16(*row_1, above_right, 1); + vst1q_u16(*dst, *row_0); + *dst += 8; + vst1q_u16(*dst, *row_1); + *dst += stride - 8; +} + +void vpx_highbd_d45_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8_t A0_0 = vld1q_u16(above); + const uint16x8_t A0_1 = vld1q_u16(above + 8); + const uint16x8_t above_right = vdupq_lane_u16(vget_high_u16(A0_1), 3); + const uint16x8_t A1_0 = vld1q_u16(above + 1); + const uint16x8_t A1_1 = vld1q_u16(above + 9); + const uint16x8_t A2_0 = vld1q_u16(above + 2); + const uint16x8_t A2_1 = vld1q_u16(above + 10); + const uint16x8_t avg_0 = vhaddq_u16(A0_0, A2_0); + const uint16x8_t avg_1 = vhaddq_u16(A0_1, A2_1); + uint16x8_t row_0 = vrhaddq_u16(avg_0, A1_0); + uint16x8_t row_1 = vrhaddq_u16(avg_1, A1_1); + (void)left; + (void)bd; + + vst1q_u16(dst, row_0); + vst1q_u16(dst + 8, row_1); + dst += stride; + d45_store_16(&dst, stride, above_right, &row_0, &row_1); + d45_store_16(&dst, stride, above_right, &row_0, &row_1); + d45_store_16(&dst, stride, above_right, &row_0, &row_1); + d45_store_16(&dst, stride, above_right, &row_0, &row_1); + d45_store_16(&dst, stride, above_right, &row_0, &row_1); + d45_store_16(&dst, stride, above_right, &row_0, &row_1); + d45_store_16(&dst, stride, above_right, &row_0, &row_1); + d45_store_16(&dst, stride, above_right, &row_0, &row_1); + d45_store_16(&dst, stride, above_right, &row_0, &row_1); + d45_store_16(&dst, stride, above_right, &row_0, &row_1); + d45_store_16(&dst, stride, above_right, &row_0, &row_1); + d45_store_16(&dst, stride, above_right, &row_0, &row_1); + d45_store_16(&dst, stride, above_right, &row_0, &row_1); + d45_store_16(&dst, stride, above_right, &row_0, &row_1); + vst1q_u16(dst, above_right); + vst1q_u16(dst + 8, above_right); +} + +void vpx_highbd_d45_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8_t A0_0 = vld1q_u16(above); + const uint16x8_t A0_1 = vld1q_u16(above + 8); + const uint16x8_t A0_2 = vld1q_u16(above + 16); + const uint16x8_t A0_3 = vld1q_u16(above + 24); + const uint16x8_t above_right = vdupq_lane_u16(vget_high_u16(A0_3), 3); + const uint16x8_t A1_0 = vld1q_u16(above + 1); + const uint16x8_t A1_1 = vld1q_u16(above + 9); + const uint16x8_t A1_2 = vld1q_u16(above + 17); + const uint16x8_t A1_3 = vld1q_u16(above + 25); + const uint16x8_t A2_0 = vld1q_u16(above + 2); + const uint16x8_t A2_1 = vld1q_u16(above + 10); + const uint16x8_t A2_2 = vld1q_u16(above + 18); + const uint16x8_t A2_3 = vld1q_u16(above + 26); + const uint16x8_t avg_0 = vhaddq_u16(A0_0, A2_0); + const uint16x8_t avg_1 = vhaddq_u16(A0_1, A2_1); + const uint16x8_t avg_2 = vhaddq_u16(A0_2, A2_2); + const uint16x8_t avg_3 = vhaddq_u16(A0_3, A2_3); + uint16x8_t row_0 = vrhaddq_u16(avg_0, A1_0); + uint16x8_t row_1 = vrhaddq_u16(avg_1, A1_1); + uint16x8_t row_2 = vrhaddq_u16(avg_2, A1_2); + uint16x8_t row_3 = vrhaddq_u16(avg_3, A1_3); + int i; + (void)left; + (void)bd; + + vst1q_u16(dst, row_0); + dst += 8; + vst1q_u16(dst, row_1); + dst += 8; + vst1q_u16(dst, row_2); + dst += 8; + vst1q_u16(dst, row_3); + dst += stride - 24; + + for (i = 0; i < 30; ++i) { + row_0 = vextq_u16(row_0, row_1, 1); + row_1 = vextq_u16(row_1, row_2, 1); + row_2 = vextq_u16(row_2, row_3, 1); + row_3 = vextq_u16(row_3, above_right, 1); + vst1q_u16(dst, row_0); + dst += 8; + vst1q_u16(dst, row_1); + dst += 8; + vst1q_u16(dst, row_2); + dst += 8; + vst1q_u16(dst, 
row_3); + dst += stride - 24; + } + + vst1q_u16(dst, above_right); + dst += 8; + vst1q_u16(dst, above_right); + dst += 8; + vst1q_u16(dst, above_right); + dst += 8; + vst1q_u16(dst, above_right); +} + +// ----------------------------------------------------------------------------- + +void vpx_highbd_d135_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8_t XA0123___ = vld1q_u16(above - 1); + const uint16x4_t L0123 = vld1_u16(left); + const uint16x4_t L3210 = vrev64_u16(L0123); + const uint16x8_t L____3210 = vcombine_u16(L0123, L3210); + const uint16x8_t L3210XA012 = vcombine_u16(L3210, vget_low_u16(XA0123___)); + const uint16x8_t L210XA0123 = vextq_u16(L____3210, XA0123___, 5); + const uint16x8_t L10XA0123_ = vextq_u16(L____3210, XA0123___, 6); + const uint16x8_t avg1 = vhaddq_u16(L3210XA012, L10XA0123_); + const uint16x8_t avg2 = vrhaddq_u16(avg1, L210XA0123); + const uint16x4_t row_0 = vget_low_u16(avg2); + const uint16x4_t row_1 = vget_high_u16(avg2); + const uint16x4_t r0 = vext_u16(row_0, row_1, 3); + const uint16x4_t r1 = vext_u16(row_0, row_1, 2); + const uint16x4_t r2 = vext_u16(row_0, row_1, 1); + (void)bd; + vst1_u16(dst, r0); + dst += stride; + vst1_u16(dst, r1); + dst += stride; + vst1_u16(dst, r2); + dst += stride; + vst1_u16(dst, row_0); +} + +void vpx_highbd_d135_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8_t XA0123456 = vld1q_u16(above - 1); + const uint16x8_t A01234567 = vld1q_u16(above); + const uint16x8_t A1234567_ = vld1q_u16(above + 1); + const uint16x8_t L01234567 = vld1q_u16(left); + const uint16x4_t L3210 = vrev64_u16(vget_low_u16(L01234567)); + const uint16x4_t L7654 = vrev64_u16(vget_high_u16(L01234567)); + const uint16x8_t L76543210 = vcombine_u16(L7654, L3210); + const uint16x8_t L6543210X = vextq_u16(L76543210, XA0123456, 1); + const uint16x8_t L543210XA0 = vextq_u16(L76543210, XA0123456, 2); + const uint16x8_t avg_0 = vhaddq_u16(L76543210, L543210XA0); + const uint16x8_t avg_1 = vhaddq_u16(XA0123456, A1234567_); + const uint16x8_t row_0 = vrhaddq_u16(avg_0, L6543210X); + const uint16x8_t row_1 = vrhaddq_u16(avg_1, A01234567); + const uint16x8_t r0 = vextq_u16(row_0, row_1, 7); + const uint16x8_t r1 = vextq_u16(row_0, row_1, 6); + const uint16x8_t r2 = vextq_u16(row_0, row_1, 5); + const uint16x8_t r3 = vextq_u16(row_0, row_1, 4); + const uint16x8_t r4 = vextq_u16(row_0, row_1, 3); + const uint16x8_t r5 = vextq_u16(row_0, row_1, 2); + const uint16x8_t r6 = vextq_u16(row_0, row_1, 1); + (void)bd; + vst1q_u16(dst, r0); + dst += stride; + vst1q_u16(dst, r1); + dst += stride; + vst1q_u16(dst, r2); + dst += stride; + vst1q_u16(dst, r3); + dst += stride; + vst1q_u16(dst, r4); + dst += stride; + vst1q_u16(dst, r5); + dst += stride; + vst1q_u16(dst, r6); + dst += stride; + vst1q_u16(dst, row_0); +} + +static INLINE void d135_store_16(uint16_t **dst, const ptrdiff_t stride, + const uint16x8_t row_0, + const uint16x8_t row_1) { + vst1q_u16(*dst, row_0); + *dst += 8; + vst1q_u16(*dst, row_1); + *dst += stride - 8; +} + +void vpx_highbd_d135_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8_t L01234567 = vld1q_u16(left); + const uint16x8_t L89abcdef = vld1q_u16(left + 8); + const uint16x4_t L3210 = vrev64_u16(vget_low_u16(L01234567)); + const uint16x4_t L7654 = vrev64_u16(vget_high_u16(L01234567)); + const uint16x4_t Lba98 = 
vrev64_u16(vget_low_u16(L89abcdef)); + const uint16x4_t Lfedc = vrev64_u16(vget_high_u16(L89abcdef)); + const uint16x8_t L76543210 = vcombine_u16(L7654, L3210); + const uint16x8_t Lfedcba98 = vcombine_u16(Lfedc, Lba98); + const uint16x8_t Ledcba987 = vextq_u16(Lfedcba98, L76543210, 1); + const uint16x8_t Ldcba9876 = vextq_u16(Lfedcba98, L76543210, 2); + const uint16x8_t avg_0 = vhaddq_u16(Lfedcba98, Ldcba9876); + const uint16x8_t row_0 = vrhaddq_u16(avg_0, Ledcba987); + + const uint16x8_t XA0123456 = vld1q_u16(above - 1); + const uint16x8_t L6543210X = vextq_u16(L76543210, XA0123456, 1); + const uint16x8_t L543210XA0 = vextq_u16(L76543210, XA0123456, 2); + const uint16x8_t avg_1 = vhaddq_u16(L76543210, L543210XA0); + const uint16x8_t row_1 = vrhaddq_u16(avg_1, L6543210X); + + const uint16x8_t A01234567 = vld1q_u16(above); + const uint16x8_t A12345678 = vld1q_u16(above + 1); + const uint16x8_t avg_2 = vhaddq_u16(XA0123456, A12345678); + const uint16x8_t row_2 = vrhaddq_u16(avg_2, A01234567); + + const uint16x8_t A789abcde = vld1q_u16(above + 7); + const uint16x8_t A89abcdef = vld1q_u16(above + 8); + const uint16x8_t A9abcdef_ = vld1q_u16(above + 9); + const uint16x8_t avg_3 = vhaddq_u16(A789abcde, A9abcdef_); + const uint16x8_t row_3 = vrhaddq_u16(avg_3, A89abcdef); + + const uint16x8_t r0_0 = vextq_u16(row_1, row_2, 7); + const uint16x8_t r0_1 = vextq_u16(row_2, row_3, 7); + const uint16x8_t r1_0 = vextq_u16(row_1, row_2, 6); + const uint16x8_t r1_1 = vextq_u16(row_2, row_3, 6); + const uint16x8_t r2_0 = vextq_u16(row_1, row_2, 5); + const uint16x8_t r2_1 = vextq_u16(row_2, row_3, 5); + const uint16x8_t r3_0 = vextq_u16(row_1, row_2, 4); + const uint16x8_t r3_1 = vextq_u16(row_2, row_3, 4); + const uint16x8_t r4_0 = vextq_u16(row_1, row_2, 3); + const uint16x8_t r4_1 = vextq_u16(row_2, row_3, 3); + const uint16x8_t r5_0 = vextq_u16(row_1, row_2, 2); + const uint16x8_t r5_1 = vextq_u16(row_2, row_3, 2); + const uint16x8_t r6_0 = vextq_u16(row_1, row_2, 1); + const uint16x8_t r6_1 = vextq_u16(row_2, row_3, 1); + const uint16x8_t r8_0 = vextq_u16(row_0, row_1, 7); + const uint16x8_t r9_0 = vextq_u16(row_0, row_1, 6); + const uint16x8_t ra_0 = vextq_u16(row_0, row_1, 5); + const uint16x8_t rb_0 = vextq_u16(row_0, row_1, 4); + const uint16x8_t rc_0 = vextq_u16(row_0, row_1, 3); + const uint16x8_t rd_0 = vextq_u16(row_0, row_1, 2); + const uint16x8_t re_0 = vextq_u16(row_0, row_1, 1); + (void)bd; + + d135_store_16(&dst, stride, r0_0, r0_1); + d135_store_16(&dst, stride, r1_0, r1_1); + d135_store_16(&dst, stride, r2_0, r2_1); + d135_store_16(&dst, stride, r3_0, r3_1); + d135_store_16(&dst, stride, r4_0, r4_1); + d135_store_16(&dst, stride, r5_0, r5_1); + d135_store_16(&dst, stride, r6_0, r6_1); + d135_store_16(&dst, stride, row_1, row_2); + d135_store_16(&dst, stride, r8_0, r0_0); + d135_store_16(&dst, stride, r9_0, r1_0); + d135_store_16(&dst, stride, ra_0, r2_0); + d135_store_16(&dst, stride, rb_0, r3_0); + d135_store_16(&dst, stride, rc_0, r4_0); + d135_store_16(&dst, stride, rd_0, r5_0); + d135_store_16(&dst, stride, re_0, r6_0); + vst1q_u16(dst, row_0); + dst += 8; + vst1q_u16(dst, row_1); +} + +void vpx_highbd_d135_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8_t LL01234567 = vld1q_u16(left + 16); + const uint16x8_t LL89abcdef = vld1q_u16(left + 24); + const uint16x4_t LL3210 = vrev64_u16(vget_low_u16(LL01234567)); + const uint16x4_t LL7654 = vrev64_u16(vget_high_u16(LL01234567)); + const uint16x4_t LLba98 = 
vrev64_u16(vget_low_u16(LL89abcdef)); + const uint16x4_t LLfedc = vrev64_u16(vget_high_u16(LL89abcdef)); + const uint16x8_t LL76543210 = vcombine_u16(LL7654, LL3210); + const uint16x8_t LLfedcba98 = vcombine_u16(LLfedc, LLba98); + const uint16x8_t LLedcba987 = vextq_u16(LLfedcba98, LL76543210, 1); + const uint16x8_t LLdcba9876 = vextq_u16(LLfedcba98, LL76543210, 2); + const uint16x8_t avg_0 = vhaddq_u16(LLfedcba98, LLdcba9876); + uint16x8_t row_0 = vrhaddq_u16(avg_0, LLedcba987); + + const uint16x8_t LU01234567 = vld1q_u16(left); + const uint16x8_t LU89abcdef = vld1q_u16(left + 8); + const uint16x4_t LU3210 = vrev64_u16(vget_low_u16(LU01234567)); + const uint16x4_t LU7654 = vrev64_u16(vget_high_u16(LU01234567)); + const uint16x4_t LUba98 = vrev64_u16(vget_low_u16(LU89abcdef)); + const uint16x4_t LUfedc = vrev64_u16(vget_high_u16(LU89abcdef)); + const uint16x8_t LU76543210 = vcombine_u16(LU7654, LU3210); + const uint16x8_t LUfedcba98 = vcombine_u16(LUfedc, LUba98); + const uint16x8_t LL6543210Uf = vextq_u16(LL76543210, LUfedcba98, 1); + const uint16x8_t LL543210Ufe = vextq_u16(LL76543210, LUfedcba98, 2); + const uint16x8_t avg_1 = vhaddq_u16(LL76543210, LL543210Ufe); + uint16x8_t row_1 = vrhaddq_u16(avg_1, LL6543210Uf); + + const uint16x8_t LUedcba987 = vextq_u16(LUfedcba98, LU76543210, 1); + const uint16x8_t LUdcba9876 = vextq_u16(LUfedcba98, LU76543210, 2); + const uint16x8_t avg_2 = vhaddq_u16(LUfedcba98, LUdcba9876); + uint16x8_t row_2 = vrhaddq_u16(avg_2, LUedcba987); + + const uint16x8_t XAL0123456 = vld1q_u16(above - 1); + const uint16x8_t LU6543210X = vextq_u16(LU76543210, XAL0123456, 1); + const uint16x8_t LU543210XA0 = vextq_u16(LU76543210, XAL0123456, 2); + const uint16x8_t avg_3 = vhaddq_u16(LU76543210, LU543210XA0); + uint16x8_t row_3 = vrhaddq_u16(avg_3, LU6543210X); + + const uint16x8_t AL01234567 = vld1q_u16(above); + const uint16x8_t AL12345678 = vld1q_u16(above + 1); + const uint16x8_t avg_4 = vhaddq_u16(XAL0123456, AL12345678); + uint16x8_t row_4 = vrhaddq_u16(avg_4, AL01234567); + + const uint16x8_t AL789abcde = vld1q_u16(above + 7); + const uint16x8_t AL89abcdef = vld1q_u16(above + 8); + const uint16x8_t AL9abcdefg = vld1q_u16(above + 9); + const uint16x8_t avg_5 = vhaddq_u16(AL789abcde, AL9abcdefg); + uint16x8_t row_5 = vrhaddq_u16(avg_5, AL89abcdef); + + const uint16x8_t ALfR0123456 = vld1q_u16(above + 15); + const uint16x8_t AR01234567 = vld1q_u16(above + 16); + const uint16x8_t AR12345678 = vld1q_u16(above + 17); + const uint16x8_t avg_6 = vhaddq_u16(ALfR0123456, AR12345678); + uint16x8_t row_6 = vrhaddq_u16(avg_6, AR01234567); + + const uint16x8_t AR789abcde = vld1q_u16(above + 23); + const uint16x8_t AR89abcdef = vld1q_u16(above + 24); + const uint16x8_t AR9abcdef_ = vld1q_u16(above + 25); + const uint16x8_t avg_7 = vhaddq_u16(AR789abcde, AR9abcdef_); + uint16x8_t row_7 = vrhaddq_u16(avg_7, AR89abcdef); + int i, j; + (void)bd; + + dst += 31 * stride; + for (i = 0; i < 4; ++i) { + for (j = 0; j < 8; ++j) { + vst1q_u16(dst, row_0); + dst += 8; + vst1q_u16(dst, row_1); + dst += 8; + vst1q_u16(dst, row_2); + dst += 8; + vst1q_u16(dst, row_3); + dst -= stride + 24; + row_0 = vextq_u16(row_0, row_1, 1); + row_1 = vextq_u16(row_1, row_2, 1); + row_2 = vextq_u16(row_2, row_3, 1); + row_3 = vextq_u16(row_3, row_4, 1); + row_4 = vextq_u16(row_4, row_4, 1); + } + row_4 = row_5; + row_5 = row_6; + row_6 = row_7; + } +} + +//------------------------------------------------------------------------------ + +void vpx_highbd_v_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + 
const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x4_t row = vld1_u16(above); + int i; + (void)left; + (void)bd; + + for (i = 0; i < 4; i++, dst += stride) { + vst1_u16(dst, row); + } +} + +void vpx_highbd_v_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8_t row = vld1q_u16(above); + int i; + (void)left; + (void)bd; + + for (i = 0; i < 8; i++, dst += stride) { + vst1q_u16(dst, row); + } +} + +void vpx_highbd_v_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8x2_t row = vld2q_u16(above); + int i; + (void)left; + (void)bd; + + for (i = 0; i < 16; i++, dst += stride) { + vst2q_u16(dst, row); + } +} + +void vpx_highbd_v_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8x2_t row0 = vld2q_u16(above); + const uint16x8x2_t row1 = vld2q_u16(above + 16); + int i; + (void)left; + (void)bd; + + for (i = 0; i < 32; i++) { + vst2q_u16(dst, row0); + dst += 16; + vst2q_u16(dst, row1); + dst += stride - 16; + } +} + +// ----------------------------------------------------------------------------- + +void vpx_highbd_h_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x4_t left_u16 = vld1_u16(left); + uint16x4_t row; + (void)above; + (void)bd; + + row = vdup_lane_u16(left_u16, 0); + vst1_u16(dst, row); + dst += stride; + row = vdup_lane_u16(left_u16, 1); + vst1_u16(dst, row); + dst += stride; + row = vdup_lane_u16(left_u16, 2); + vst1_u16(dst, row); + dst += stride; + row = vdup_lane_u16(left_u16, 3); + vst1_u16(dst, row); +} + +void vpx_highbd_h_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8_t left_u16 = vld1q_u16(left); + const uint16x4_t left_low = vget_low_u16(left_u16); + const uint16x4_t left_high = vget_high_u16(left_u16); + uint16x8_t row; + (void)above; + (void)bd; + + row = vdupq_lane_u16(left_low, 0); + vst1q_u16(dst, row); + dst += stride; + row = vdupq_lane_u16(left_low, 1); + vst1q_u16(dst, row); + dst += stride; + row = vdupq_lane_u16(left_low, 2); + vst1q_u16(dst, row); + dst += stride; + row = vdupq_lane_u16(left_low, 3); + vst1q_u16(dst, row); + dst += stride; + row = vdupq_lane_u16(left_high, 0); + vst1q_u16(dst, row); + dst += stride; + row = vdupq_lane_u16(left_high, 1); + vst1q_u16(dst, row); + dst += stride; + row = vdupq_lane_u16(left_high, 2); + vst1q_u16(dst, row); + dst += stride; + row = vdupq_lane_u16(left_high, 3); + vst1q_u16(dst, row); +} + +static INLINE void h_store_16(uint16_t **dst, const ptrdiff_t stride, + const uint16x8_t row) { + // Note: vst1q is faster than vst2q + vst1q_u16(*dst, row); + *dst += 8; + vst1q_u16(*dst, row); + *dst += stride - 8; +} + +void vpx_highbd_h_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + int i; + (void)above; + (void)bd; + + for (i = 0; i < 2; i++, left += 8) { + const uint16x8_t left_u16q = vld1q_u16(left); + const uint16x4_t left_low = vget_low_u16(left_u16q); + const uint16x4_t left_high = vget_high_u16(left_u16q); + uint16x8_t row; + + row = vdupq_lane_u16(left_low, 0); + h_store_16(&dst, stride, row); + row = vdupq_lane_u16(left_low, 1); + h_store_16(&dst, stride, row); + row = vdupq_lane_u16(left_low, 2); + h_store_16(&dst, stride, row); + row = 
vdupq_lane_u16(left_low, 3); + h_store_16(&dst, stride, row); + row = vdupq_lane_u16(left_high, 0); + h_store_16(&dst, stride, row); + row = vdupq_lane_u16(left_high, 1); + h_store_16(&dst, stride, row); + row = vdupq_lane_u16(left_high, 2); + h_store_16(&dst, stride, row); + row = vdupq_lane_u16(left_high, 3); + h_store_16(&dst, stride, row); + } +} + +static INLINE void h_store_32(uint16_t **dst, const ptrdiff_t stride, + const uint16x8_t row) { + // Note: vst1q is faster than vst2q + vst1q_u16(*dst, row); + *dst += 8; + vst1q_u16(*dst, row); + *dst += 8; + vst1q_u16(*dst, row); + *dst += 8; + vst1q_u16(*dst, row); + *dst += stride - 24; +} + +void vpx_highbd_h_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + int i; + (void)above; + (void)bd; + + for (i = 0; i < 4; i++, left += 8) { + const uint16x8_t left_u16q = vld1q_u16(left); + const uint16x4_t left_low = vget_low_u16(left_u16q); + const uint16x4_t left_high = vget_high_u16(left_u16q); + uint16x8_t row; + + row = vdupq_lane_u16(left_low, 0); + h_store_32(&dst, stride, row); + row = vdupq_lane_u16(left_low, 1); + h_store_32(&dst, stride, row); + row = vdupq_lane_u16(left_low, 2); + h_store_32(&dst, stride, row); + row = vdupq_lane_u16(left_low, 3); + h_store_32(&dst, stride, row); + row = vdupq_lane_u16(left_high, 0); + h_store_32(&dst, stride, row); + row = vdupq_lane_u16(left_high, 1); + h_store_32(&dst, stride, row); + row = vdupq_lane_u16(left_high, 2); + h_store_32(&dst, stride, row); + row = vdupq_lane_u16(left_high, 3); + h_store_32(&dst, stride, row); + } +} + +// ----------------------------------------------------------------------------- + +void vpx_highbd_tm_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int16x8_t max = vmovq_n_s16((1 << bd) - 1); + const int16x8_t top_left = vld1q_dup_s16((const int16_t *)(above - 1)); + const int16x4_t above_s16d = vld1_s16((const int16_t *)above); + const int16x8_t above_s16 = vcombine_s16(above_s16d, above_s16d); + const int16x4_t left_s16 = vld1_s16((const int16_t *)left); + const int16x8_t sub = vsubq_s16(above_s16, top_left); + int16x8_t sum; + uint16x8_t row; + + sum = vcombine_s16(vdup_lane_s16(left_s16, 0), vdup_lane_s16(left_s16, 1)); + sum = vaddq_s16(sum, sub); + sum = vminq_s16(sum, max); + row = vqshluq_n_s16(sum, 0); + vst1_u16(dst, vget_low_u16(row)); + dst += stride; + vst1_u16(dst, vget_high_u16(row)); + dst += stride; + + sum = vcombine_s16(vdup_lane_s16(left_s16, 2), vdup_lane_s16(left_s16, 3)); + sum = vaddq_s16(sum, sub); + sum = vminq_s16(sum, max); + row = vqshluq_n_s16(sum, 0); + vst1_u16(dst, vget_low_u16(row)); + dst += stride; + vst1_u16(dst, vget_high_u16(row)); +} + +static INLINE void tm_8_kernel(uint16_t **dst, const ptrdiff_t stride, + const int16x8_t left_dup, const int16x8_t sub, + const int16x8_t max) { + uint16x8_t row; + int16x8_t sum = vaddq_s16(left_dup, sub); + sum = vminq_s16(sum, max); + row = vqshluq_n_s16(sum, 0); + vst1q_u16(*dst, row); + *dst += stride; +} + +void vpx_highbd_tm_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int16x8_t max = vmovq_n_s16((1 << bd) - 1); + const int16x8_t top_left = vld1q_dup_s16((const int16_t *)(above - 1)); + const int16x8_t above_s16 = vld1q_s16((const int16_t *)above); + const int16x8_t left_s16 = vld1q_s16((const int16_t *)left); + const int16x8_t sub = vsubq_s16(above_s16, top_left); + int16x4_t 
left_s16d; + int16x8_t left_dup; + int i; + + left_s16d = vget_low_s16(left_s16); + + for (i = 0; i < 2; i++, left_s16d = vget_high_s16(left_s16)) { + left_dup = vdupq_lane_s16(left_s16d, 0); + tm_8_kernel(&dst, stride, left_dup, sub, max); + + left_dup = vdupq_lane_s16(left_s16d, 1); + tm_8_kernel(&dst, stride, left_dup, sub, max); + + left_dup = vdupq_lane_s16(left_s16d, 2); + tm_8_kernel(&dst, stride, left_dup, sub, max); + + left_dup = vdupq_lane_s16(left_s16d, 3); + tm_8_kernel(&dst, stride, left_dup, sub, max); + } +} + +static INLINE void tm_16_kernel(uint16_t **dst, const ptrdiff_t stride, + const int16x8_t left_dup, const int16x8_t sub0, + const int16x8_t sub1, const int16x8_t max) { + uint16x8_t row0, row1; + int16x8_t sum0 = vaddq_s16(left_dup, sub0); + int16x8_t sum1 = vaddq_s16(left_dup, sub1); + sum0 = vminq_s16(sum0, max); + sum1 = vminq_s16(sum1, max); + row0 = vqshluq_n_s16(sum0, 0); + row1 = vqshluq_n_s16(sum1, 0); + vst1q_u16(*dst, row0); + *dst += 8; + vst1q_u16(*dst, row1); + *dst += stride - 8; +} + +void vpx_highbd_tm_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int16x8_t max = vmovq_n_s16((1 << bd) - 1); + const int16x8_t top_left = vld1q_dup_s16((const int16_t *)(above - 1)); + const int16x8_t above0 = vld1q_s16((const int16_t *)above); + const int16x8_t above1 = vld1q_s16((const int16_t *)(above + 8)); + const int16x8_t sub0 = vsubq_s16(above0, top_left); + const int16x8_t sub1 = vsubq_s16(above1, top_left); + int16x8_t left_dup; + int i, j; + + for (j = 0; j < 2; j++, left += 8) { + const int16x8_t left_s16q = vld1q_s16((const int16_t *)left); + int16x4_t left_s16d = vget_low_s16(left_s16q); + for (i = 0; i < 2; i++, left_s16d = vget_high_s16(left_s16q)) { + left_dup = vdupq_lane_s16(left_s16d, 0); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1, max); + + left_dup = vdupq_lane_s16(left_s16d, 1); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1, max); + + left_dup = vdupq_lane_s16(left_s16d, 2); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1, max); + + left_dup = vdupq_lane_s16(left_s16d, 3); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1, max); + } + } +} + +static INLINE void tm_32_kernel(uint16_t **dst, const ptrdiff_t stride, + const int16x8_t left_dup, const int16x8_t sub0, + const int16x8_t sub1, const int16x8_t sub2, + const int16x8_t sub3, const int16x8_t max) { + uint16x8_t row0, row1, row2, row3; + int16x8_t sum0 = vaddq_s16(left_dup, sub0); + int16x8_t sum1 = vaddq_s16(left_dup, sub1); + int16x8_t sum2 = vaddq_s16(left_dup, sub2); + int16x8_t sum3 = vaddq_s16(left_dup, sub3); + sum0 = vminq_s16(sum0, max); + sum1 = vminq_s16(sum1, max); + sum2 = vminq_s16(sum2, max); + sum3 = vminq_s16(sum3, max); + row0 = vqshluq_n_s16(sum0, 0); + row1 = vqshluq_n_s16(sum1, 0); + row2 = vqshluq_n_s16(sum2, 0); + row3 = vqshluq_n_s16(sum3, 0); + vst1q_u16(*dst, row0); + *dst += 8; + vst1q_u16(*dst, row1); + *dst += 8; + vst1q_u16(*dst, row2); + *dst += 8; + vst1q_u16(*dst, row3); + *dst += stride - 24; +} + +void vpx_highbd_tm_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int16x8_t max = vmovq_n_s16((1 << bd) - 1); + const int16x8_t top_left = vld1q_dup_s16((const int16_t *)(above - 1)); + const int16x8_t above0 = vld1q_s16((const int16_t *)above); + const int16x8_t above1 = vld1q_s16((const int16_t *)(above + 8)); + const int16x8_t above2 = vld1q_s16((const int16_t *)(above + 16)); + const int16x8_t above3 = 
vld1q_s16((const int16_t *)(above + 24)); + const int16x8_t sub0 = vsubq_s16(above0, top_left); + const int16x8_t sub1 = vsubq_s16(above1, top_left); + const int16x8_t sub2 = vsubq_s16(above2, top_left); + const int16x8_t sub3 = vsubq_s16(above3, top_left); + int16x8_t left_dup; + int i, j; + + for (i = 0; i < 4; i++, left += 8) { + const int16x8_t left_s16q = vld1q_s16((const int16_t *)left); + int16x4_t left_s16d = vget_low_s16(left_s16q); + for (j = 0; j < 2; j++, left_s16d = vget_high_s16(left_s16q)) { + left_dup = vdupq_lane_s16(left_s16d, 0); + tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3, max); + + left_dup = vdupq_lane_s16(left_s16d, 1); + tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3, max); + + left_dup = vdupq_lane_s16(left_s16d, 2); + tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3, max); + + left_dup = vdupq_lane_s16(left_s16d, 3); + tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3, max); + } + } +} diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.asm index e3c0c5210d2..d648840df40 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.asm +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.asm @@ -15,12 +15,11 @@ AREA ||.text||, CODE, READONLY, ALIGN=2 -;void vpx_idct16x16_1_add_neon(int16_t *input, uint8_t *dest, -; int dest_stride) +;void vpx_idct16x16_1_add_neon(int16_t *input, uint8_t *dest, int stride) ; ; r0 int16_t input ; r1 uint8_t *dest -; r2 int dest_stride) +; r2 int stride) |vpx_idct16x16_1_add_neon| PROC ldrsh r0, [r0] diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c index f1e49ff5178..968bc5cc3ab 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c @@ -11,49 +11,66 @@ #include <arm_neon.h> #include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" #include "vpx_dsp/inv_txfm.h" -#include "vpx_ports/mem.h" -void vpx_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest, - int dest_stride) { - uint8x8_t d2u8, d3u8, d30u8, d31u8; - uint64x1_t d2u64, d3u64, d4u64, d5u64; - uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16; - int16x8_t q0s16; - uint8_t *d1, *d2; - int16_t i, j, a1; - int16_t out = dct_const_round_shift(input[0] * cospi_16_64); - out = dct_const_round_shift(out * cospi_16_64); - a1 = ROUND_POWER_OF_TWO(out, 6); - - q0s16 = vdupq_n_s16(a1); - q0u16 = vreinterpretq_u16_s16(q0s16); - - for (d1 = d2 = dest, i = 0; i < 4; i++) { - for (j = 0; j < 2; j++) { - d2u64 = vld1_u64((const uint64_t *)d1); - d3u64 = vld1_u64((const uint64_t *)(d1 + 8)); - d1 += dest_stride; - d4u64 = vld1_u64((const uint64_t *)d1); - d5u64 = vld1_u64((const uint64_t *)(d1 + 8)); - d1 += dest_stride; +static INLINE void idct16x16_1_add_pos_kernel(uint8_t **dest, const int stride, + const uint8x16_t res) { + const uint8x16_t a = vld1q_u8(*dest); + const uint8x16_t b = vqaddq_u8(a, res); + vst1q_u8(*dest, b); + *dest += stride; +} - q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64)); - q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64)); - q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64)); - q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64)); +static INLINE void idct16x16_1_add_neg_kernel(uint8_t **dest, 
const int stride, + const uint8x16_t res) { + const uint8x16_t a = vld1q_u8(*dest); + const uint8x16_t b = vqsubq_u8(a, res); + vst1q_u8(*dest, b); + *dest += stride; +} - d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); - d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); - d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); +void vpx_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest, + int stride) { + const int16_t out0 = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64)); + const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6); - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d3u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8)); - vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d31u8)); - d2 += dest_stride; - } + if (a1 >= 0) { + const uint8x16_t dc = create_dcq(a1); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + } else { + const uint8x16_t dc = create_dcq(-a1); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); } } diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.asm index 5e64cea0ae7..ea6b099d3bb 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.asm +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.asm @@ -8,8 +8,14 @@ ; be found in the AUTHORS file in the root of the source tree. 
; + INCLUDE vpx_dsp/arm/idct_neon.asm.S + EXPORT |vpx_idct16x16_256_add_neon_pass1| EXPORT |vpx_idct16x16_256_add_neon_pass2| + IF CONFIG_VP9_HIGHBITDEPTH + EXPORT |vpx_idct16x16_256_add_neon_pass1_tran_low| + EXPORT |vpx_idct16x16_256_add_neon_pass2_tran_low| + ENDIF EXPORT |vpx_idct16x16_10_add_neon_pass1| EXPORT |vpx_idct16x16_10_add_neon_pass2| ARM @@ -36,12 +42,10 @@ MEND AREA Block, CODE, READONLY ; name this block of code -;void |vpx_idct16x16_256_add_neon_pass1|(int16_t *input, -; int16_t *output, int output_stride) +;void |vpx_idct16x16_256_add_neon_pass1|(const int16_t *input, int16_t *output) ; -; r0 int16_t input +; r0 const int16_t *input ; r1 int16_t *output -; r2 int output_stride) ; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output ; will be stored back into q8-q15 registers. This function will touch q0-q7 @@ -60,6 +64,7 @@ vld2.s16 {q1,q2}, [r0]! vmov.s16 q15, q1 +idct16x16_256_add_neon_pass1 ; cospi_28_64 = 3196 movw r3, #0x0c7c @@ -100,12 +105,12 @@ vdup.16 d3, r12 ; duplicate cospi_20_64 ; dct_const_round_shift(temp1) - vqrshrn.s32 d8, q2, #14 ; >> 14 - vqrshrn.s32 d9, q3, #14 ; >> 14 + vrshrn.s32 d8, q2, #14 ; >> 14 + vrshrn.s32 d9, q3, #14 ; >> 14 ; dct_const_round_shift(temp2) - vqrshrn.s32 d14, q5, #14 ; >> 14 - vqrshrn.s32 d15, q6, #14 ; >> 14 + vrshrn.s32 d14, q5, #14 ; >> 14 + vrshrn.s32 d15, q6, #14 ; >> 14 ; preloading to avoid stall ; cospi_16_64 = 11585 @@ -131,12 +136,12 @@ vmlal.s16 q15, d23, d2 ; dct_const_round_shift(temp1) - vqrshrn.s32 d10, q2, #14 ; >> 14 - vqrshrn.s32 d11, q3, #14 ; >> 14 + vrshrn.s32 d10, q2, #14 ; >> 14 + vrshrn.s32 d11, q3, #14 ; >> 14 ; dct_const_round_shift(temp2) - vqrshrn.s32 d12, q9, #14 ; >> 14 - vqrshrn.s32 d13, q15, #14 ; >> 14 + vrshrn.s32 d12, q9, #14 ; >> 14 + vrshrn.s32 d13, q15, #14 ; >> 14 ; stage 4 vdup.16 d30, r3 ; cospi_16_64 @@ -164,12 +169,12 @@ vsub.s32 q1, q11, q1 ; dct_const_round_shift(temp1) - vqrshrn.s32 d16, q3, #14 ; >> 14 - vqrshrn.s32 d17, q12, #14 ; >> 14 + vrshrn.s32 d16, q3, #14 ; >> 14 + vrshrn.s32 d17, q12, #14 ; >> 14 ; dct_const_round_shift(temp2) - vqrshrn.s32 d18, q13, #14 ; >> 14 - vqrshrn.s32 d19, q1, #14 ; >> 14 + vrshrn.s32 d18, q13, #14 ; >> 14 + vrshrn.s32 d19, q1, #14 ; >> 14 ; step1[2] * cospi_24_64 - step1[3] * cospi_8_64; ; step1[2] * cospi_8_64 @@ -189,12 +194,12 @@ vmlsl.s16 q13, d29, d31 ; dct_const_round_shift(temp2) - vqrshrn.s32 d22, q0, #14 ; >> 14 - vqrshrn.s32 d23, q1, #14 ; >> 14 + vrshrn.s32 d22, q0, #14 ; >> 14 + vrshrn.s32 d23, q1, #14 ; >> 14 ; dct_const_round_shift(temp1) - vqrshrn.s32 d20, q12, #14 ; >> 14 - vqrshrn.s32 d21, q13, #14 ; >> 14 + vrshrn.s32 d20, q12, #14 ; >> 14 + vrshrn.s32 d21, q13, #14 ; >> 14 vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5]; vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5]; @@ -229,15 +234,15 @@ vadd.s32 q10, q10, q12 ; dct_const_round_shift(temp1) - vqrshrn.s32 d10, q6, #14 ; >> 14 - vqrshrn.s32 d11, q13, #14 ; >> 14 + vrshrn.s32 d10, q6, #14 ; >> 14 + vrshrn.s32 d11, q13, #14 ; >> 14 ; dct_const_round_shift(temp2) - vqrshrn.s32 d12, q9, #14 ; >> 14 - vqrshrn.s32 d13, q10, #14 ; >> 14 + vrshrn.s32 d12, q9, #14 ; >> 14 + vrshrn.s32 d13, q10, #14 ; >> 14 ; stage 6 - vadd.s16 q8, q0, q15 ; step2[0] = step1[0] + step1[7]; + vadd.s16 q8, q0, q15 ; step2[0] = step1[0] + step1[7]; vadd.s16 q9, q1, q6 ; step2[1] = step1[1] + step1[6]; vadd.s16 q10, q2, q5 ; step2[2] = step1[2] + step1[5]; vadd.s16 q11, q3, q4 ; step2[3] = step1[3] + step1[4]; @@ -247,46 +252,54 @@ vsub.s16 q15, q0, q15 ; step2[7] = step1[0] - 
step1[7]; ; store the data - vst1.64 {d16}, [r1], r2 - vst1.64 {d17}, [r1], r2 - vst1.64 {d18}, [r1], r2 - vst1.64 {d19}, [r1], r2 - vst1.64 {d20}, [r1], r2 - vst1.64 {d21}, [r1], r2 - vst1.64 {d22}, [r1], r2 - vst1.64 {d23}, [r1], r2 - vst1.64 {d24}, [r1], r2 - vst1.64 {d25}, [r1], r2 - vst1.64 {d26}, [r1], r2 - vst1.64 {d27}, [r1], r2 - vst1.64 {d28}, [r1], r2 - vst1.64 {d29}, [r1], r2 - vst1.64 {d30}, [r1], r2 - vst1.64 {d31}, [r1], r2 + vst1.64 {q8-q9}, [r1]! + vst1.64 {q10-q11}, [r1]! + vst1.64 {q12-q13}, [r1]! + vst1.64 {q14-q15}, [r1] bx lr ENDP ; |vpx_idct16x16_256_add_neon_pass1| -;void vpx_idct16x16_256_add_neon_pass2(int16_t *src, -; int16_t *output, -; int16_t *pass1Output, -; int16_t skip_adding, -; uint8_t *dest, -; int dest_stride) + IF CONFIG_VP9_HIGHBITDEPTH +;void |vpx_idct16x16_256_add_neon_pass1_tran_low|(const tran_low_t *input, +; int16_t *output) +; +; r0 const tran_low_t *input +; r1 int16_t *output + +|vpx_idct16x16_256_add_neon_pass1_tran_low| PROC + LOAD_TRAN_LOW_TO_S16X2 d16, d17, d18, d19, r0 + LOAD_TRAN_LOW_TO_S16X2 d18, d19, d20, d21, r0 + LOAD_TRAN_LOW_TO_S16X2 d20, d21, d22, d23, r0 + LOAD_TRAN_LOW_TO_S16X2 d22, d23, d24, d25, r0 + LOAD_TRAN_LOW_TO_S16X2 d24, d25, d26, d27, r0 + LOAD_TRAN_LOW_TO_S16X2 d26, d27, d28, d29, r0 + LOAD_TRAN_LOW_TO_S16X2 d28, d29, d30, d31, r0 + LOAD_TRAN_LOW_TO_S16X2 d2, d3, d4, d5, r0 + vmov.s16 q15, q1 + + b idct16x16_256_add_neon_pass1 + ENDP ; |vpx_idct16x16_256_add_neon_pass1_tran_low| + ENDIF ; CONFIG_VP9_HIGHBITDEPTH + +;void vpx_idct16x16_256_add_neon_pass2(const int16_t *src, +; int16_t *output, +; int16_t *pass1_output, +; int16_t skip_adding, +; uint8_t *dest, +; int stride) ; -; r0 int16_t *src -; r1 int16_t *output, -; r2 int16_t *pass1Output, -; r3 int16_t skip_adding, -; r4 uint8_t *dest, -; r5 int dest_stride) +; r0 const int16_t *src +; r1 int16_t *output +; r2 int16_t *pass1_output +; r3 int16_t skip_adding +; r4 uint8_t *dest +; r5 int stride ; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output ; will be stored back into q8-q15 registers. This function will touch q0-q7 ; registers and use them as buffer during calculation. |vpx_idct16x16_256_add_neon_pass2| PROC - push {r3-r9} - ; TODO(hkuang): Find a better way to load the elements. ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15 vld2.s16 {q8,q9}, [r0]! @@ -299,6 +312,9 @@ vld2.s16 {q0,q1}, [r0]! 
vmov.s16 q15, q0; +idct16x16_256_add_neon_pass2 + push {r3-r9} + ; cospi_30_64 = 1606 movw r3, #0x0646 @@ -339,12 +355,12 @@ vdup.16 d31, r12 ; duplicate cospi_18_64 ; dct_const_round_shift(temp1) - vqrshrn.s32 d0, q2, #14 ; >> 14 - vqrshrn.s32 d1, q3, #14 ; >> 14 + vrshrn.s32 d0, q2, #14 ; >> 14 + vrshrn.s32 d1, q3, #14 ; >> 14 ; dct_const_round_shift(temp2) - vqrshrn.s32 d14, q1, #14 ; >> 14 - vqrshrn.s32 d15, q4, #14 ; >> 14 + vrshrn.s32 d14, q1, #14 ; >> 14 + vrshrn.s32 d15, q4, #14 ; >> 14 ; preloading to avoid stall ; cospi_22_64 = 7723 @@ -373,12 +389,12 @@ vdup.16 d31, r12 ; duplicate cospi_10_64 ; dct_const_round_shift(temp1) - vqrshrn.s32 d2, q2, #14 ; >> 14 - vqrshrn.s32 d3, q3, #14 ; >> 14 + vrshrn.s32 d2, q2, #14 ; >> 14 + vrshrn.s32 d3, q3, #14 ; >> 14 ; dct_const_round_shift(temp2) - vqrshrn.s32 d12, q4, #14 ; >> 14 - vqrshrn.s32 d13, q5, #14 ; >> 14 + vrshrn.s32 d12, q4, #14 ; >> 14 + vrshrn.s32 d13, q5, #14 ; >> 14 ; step1[10] * cospi_22_64 vmull.s16 q11, d20, d30 @@ -407,12 +423,12 @@ vdup.16 d31, r12 ; duplicate cospi_26_64 ; dct_const_round_shift(temp1) - vqrshrn.s32 d4, q11, #14 ; >> 14 - vqrshrn.s32 d5, q12, #14 ; >> 14 + vrshrn.s32 d4, q11, #14 ; >> 14 + vrshrn.s32 d5, q12, #14 ; >> 14 ; dct_const_round_shift(temp2) - vqrshrn.s32 d11, q5, #14 ; >> 14 - vqrshrn.s32 d10, q4, #14 ; >> 14 + vrshrn.s32 d11, q5, #14 ; >> 14 + vrshrn.s32 d10, q4, #14 ; >> 14 ; step1[11] * cospi_6_64 vmull.s16 q10, d28, d30 @@ -434,12 +450,12 @@ vadd.s16 q0, q0, q1 ; step1[8]=step2[8]+step2[9] ; dct_const_round_shift(temp1) - vqrshrn.s32 d6, q10, #14 ; >> 14 - vqrshrn.s32 d7, q11, #14 ; >> 14 + vrshrn.s32 d6, q10, #14 ; >> 14 + vrshrn.s32 d7, q11, #14 ; >> 14 ; dct_const_round_shift(temp2) - vqrshrn.s32 d8, q12, #14 ; >> 14 - vqrshrn.s32 d9, q13, #14 ; >> 14 + vrshrn.s32 d8, q12, #14 ; >> 14 + vrshrn.s32 d9, q13, #14 ; >> 14 ; stage 3 vsub.s16 q10, q3, q2 ; step1[10]=-step2[10]+step2[11] @@ -480,12 +496,12 @@ vdup.16 d30, r12 ; duplicate -cospi_8_64 ; dct_const_round_shift(temp2) - vqrshrn.s32 d12, q2, #14 ; >> 14 - vqrshrn.s32 d13, q3, #14 ; >> 14 + vrshrn.s32 d12, q2, #14 ; >> 14 + vrshrn.s32 d13, q3, #14 ; >> 14 ; dct_const_round_shift(temp1) - vqrshrn.s32 d2, q4, #14 ; >> 14 - vqrshrn.s32 d3, q5, #14 ; >> 14 + vrshrn.s32 d2, q4, #14 ; >> 14 + vrshrn.s32 d3, q5, #14 ; >> 14 vmov.s16 q3, q11 vmov.s16 q4, q12 @@ -507,12 +523,12 @@ vmlal.s16 q9, d27, d31 ; dct_const_round_shift(temp2) - vqrshrn.s32 d4, q11, #14 ; >> 14 - vqrshrn.s32 d5, q12, #14 ; >> 14 + vrshrn.s32 d4, q11, #14 ; >> 14 + vrshrn.s32 d5, q12, #14 ; >> 14 ; dct_const_round_shift(temp1) - vqrshrn.s32 d10, q8, #14 ; >> 14 - vqrshrn.s32 d11, q9, #14 ; >> 14 + vrshrn.s32 d10, q8, #14 ; >> 14 + vrshrn.s32 d11, q9, #14 ; >> 14 ; stage 5 vadd.s16 q8, q0, q3 ; step1[8] = step2[8]+step2[11]; @@ -547,12 +563,12 @@ vadd.s32 q4, q4, q1 ; dct_const_round_shift(temp1) - vqrshrn.s32 d4, q5, #14 ; >> 14 - vqrshrn.s32 d5, q6, #14 ; >> 14 + vrshrn.s32 d4, q5, #14 ; >> 14 + vrshrn.s32 d5, q6, #14 ; >> 14 ; dct_const_round_shift(temp2) - vqrshrn.s32 d10, q10, #14 ; >> 14 - vqrshrn.s32 d11, q4, #14 ; >> 14 + vrshrn.s32 d10, q10, #14 ; >> 14 + vrshrn.s32 d11, q4, #14 ; >> 14 ; step1[11] * cospi_16_64 vmull.s16 q0, d22, d14 @@ -571,21 +587,21 @@ vadd.s32 q6, q6, q1 ; dct_const_round_shift(temp1) - vqrshrn.s32 d6, q10, #14 ; >> 14 - vqrshrn.s32 d7, q4, #14 ; >> 14 + vrshrn.s32 d6, q10, #14 ; >> 14 + vrshrn.s32 d7, q4, #14 ; >> 14 ; dct_const_round_shift(temp2) - vqrshrn.s32 d8, q13, #14 ; >> 14 - vqrshrn.s32 d9, q6, #14 ; >> 14 + vrshrn.s32 d8, q13, 
#14 ; >> 14 + vrshrn.s32 d9, q6, #14 ; >> 14 - mov r4, #16 ; pass1Output stride + mov r4, #16 ; pass1_output stride ldr r3, [sp] ; load skip_adding cmp r3, #0 ; check if need adding dest data beq skip_adding_dest ldr r7, [sp, #28] ; dest used to save element 0-7 mov r9, r7 ; save dest pointer for later use - ldr r8, [sp, #32] ; load dest_stride + ldr r8, [sp, #32] ; load stride ; stage 7 ; load the data in pass1 @@ -599,8 +615,8 @@ vadd.s16 q13, q1, q14 ; step2[1] + step2[14] vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO - vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i] - vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i] + vaddw.u8 q12, q12, d12 ; + dest[j * stride + i] + vaddw.u8 q13, q13, d13 ; + dest[j * stride + i] vqmovun.s16 d12, q12 ; clip pixel vqmovun.s16 d13, q13 ; clip pixel vst1.64 {d12}, [r9], r8 ; store the data @@ -613,8 +629,8 @@ vadd.s16 q13, q11, q4 ; step2[3] + step2[12] vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO - vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i] - vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i] + vaddw.u8 q12, q12, d12 ; + dest[j * stride + i] + vaddw.u8 q13, q13, d13 ; + dest[j * stride + i] vqmovun.s16 d12, q12 ; clip pixel vqmovun.s16 d13, q13 ; clip pixel vst1.64 {d12}, [r9], r8 ; store the data @@ -631,8 +647,8 @@ vadd.s16 q13, q1, q2 ; step2[5] + step2[10] vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO - vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i] - vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i] + vaddw.u8 q12, q12, d12 ; + dest[j * stride + i] + vaddw.u8 q13, q13, d13 ; + dest[j * stride + i] vqmovun.s16 d12, q12 ; clip pixel vqmovun.s16 d13, q13 ; clip pixel vst1.64 {d12}, [r9], r8 ; store the data @@ -645,8 +661,8 @@ vadd.s16 q13, q11, q8 ; step2[7] + step2[8] vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO - vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i] - vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i] + vaddw.u8 q12, q12, d12 ; + dest[j * stride + i] + vaddw.u8 q13, q13, d13 ; + dest[j * stride + i] vqmovun.s16 d12, q12 ; clip pixel vqmovun.s16 d13, q13 ; clip pixel vst1.64 {d12}, [r9], r8 ; store the data @@ -658,42 +674,42 @@ ; store the data output 8,9,10,11,12,13,14,15 vrshr.s16 q8, q8, #6 ; ROUND_POWER_OF_TWO - vaddw.u8 q8, q8, d12 ; + dest[j * dest_stride + i] + vaddw.u8 q8, q8, d12 ; + dest[j * stride + i] vqmovun.s16 d12, q8 ; clip pixel vst1.64 {d12}, [r9], r8 ; store the data vld1.64 {d12}, [r7], r8 ; load destinatoin data vrshr.s16 q9, q9, #6 - vaddw.u8 q9, q9, d13 ; + dest[j * dest_stride + i] + vaddw.u8 q9, q9, d13 ; + dest[j * stride + i] vqmovun.s16 d13, q9 ; clip pixel vst1.64 {d13}, [r9], r8 ; store the data vld1.64 {d13}, [r7], r8 ; load destinatoin data vrshr.s16 q2, q2, #6 - vaddw.u8 q2, q2, d12 ; + dest[j * dest_stride + i] + vaddw.u8 q2, q2, d12 ; + dest[j * stride + i] vqmovun.s16 d12, q2 ; clip pixel vst1.64 {d12}, [r9], r8 ; store the data vld1.64 {d12}, [r7], r8 ; load destinatoin data vrshr.s16 q3, q3, #6 - vaddw.u8 q3, q3, d13 ; + dest[j * dest_stride + i] + vaddw.u8 q3, q3, d13 ; + dest[j * stride + i] vqmovun.s16 d13, q3 ; clip pixel vst1.64 {d13}, [r9], r8 ; store the data vld1.64 {d13}, [r7], r8 ; load destinatoin data vrshr.s16 q4, q4, #6 - vaddw.u8 q4, q4, d12 ; + dest[j * dest_stride + i] + vaddw.u8 q4, q4, d12 ; + dest[j * stride + i] vqmovun.s16 d12, q4 ; clip pixel vst1.64 {d12}, [r9], r8 ; store 
the data vld1.64 {d12}, [r7], r8 ; load destinatoin data vrshr.s16 q5, q5, #6 - vaddw.u8 q5, q5, d13 ; + dest[j * dest_stride + i] + vaddw.u8 q5, q5, d13 ; + dest[j * stride + i] vqmovun.s16 d13, q5 ; clip pixel vst1.64 {d13}, [r9], r8 ; store the data vld1.64 {d13}, [r7], r8 ; load destinatoin data vrshr.s16 q14, q14, #6 - vaddw.u8 q14, q14, d12 ; + dest[j * dest_stride + i] + vaddw.u8 q14, q14, d12 ; + dest[j * stride + i] vqmovun.s16 d12, q14 ; clip pixel vst1.64 {d12}, [r9], r8 ; store the data vld1.64 {d12}, [r7], r8 ; load destinatoin data vrshr.s16 q15, q15, #6 - vaddw.u8 q15, q15, d13 ; + dest[j * dest_stride + i] + vaddw.u8 q15, q15, d13 ; + dest[j * stride + i] vqmovun.s16 d13, q15 ; clip pixel vst1.64 {d13}, [r9], r8 ; store the data b end_idct16x16_pass2 @@ -767,12 +783,41 @@ end_idct16x16_pass2 bx lr ENDP ; |vpx_idct16x16_256_add_neon_pass2| -;void |vpx_idct16x16_10_add_neon_pass1|(int16_t *input, -; int16_t *output, int output_stride) + IF CONFIG_VP9_HIGHBITDEPTH +;void vpx_idct16x16_256_add_neon_pass2_tran_low(const tran_low_t *src, +; int16_t *output, +; int16_t *pass1_output, +; int16_t skip_adding, +; uint8_t *dest, +; int stride) ; -; r0 int16_t input +; r0 const tran_low_t *src +; r1 int16_t *output +; r2 int16_t *pass1_output +; r3 int16_t skip_adding +; r4 uint8_t *dest +; r5 int stride + +|vpx_idct16x16_256_add_neon_pass2_tran_low| PROC + LOAD_TRAN_LOW_TO_S16X2 d16, d17, d18, d19, r0 + LOAD_TRAN_LOW_TO_S16X2 d18, d19, d20, d21, r0 + LOAD_TRAN_LOW_TO_S16X2 d20, d21, d22, d23, r0 + LOAD_TRAN_LOW_TO_S16X2 d22, d23, d24, d25, r0 + LOAD_TRAN_LOW_TO_S16X2 d24, d25, d26, d27, r0 + LOAD_TRAN_LOW_TO_S16X2 d26, d27, d28, d29, r0 + LOAD_TRAN_LOW_TO_S16X2 d28, d29, d30, d31, r0 + LOAD_TRAN_LOW_TO_S16X2 d0, d1, d2, d3, r0 + vmov.s16 q15, q0 + + b idct16x16_256_add_neon_pass2 + ENDP ; |vpx_idct16x16_256_add_neon_pass2_tran_low| + ENDIF ; CONFIG_VP9_HIGHBITDEPTH + +;void |vpx_idct16x16_10_add_neon_pass1|(const tran_low_t *input, +; int16_t *output) +; +; r0 const tran_low_t *input ; r1 int16_t *output -; r2 int output_stride) ; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output ; will be stored back into q8-q15 registers. This function will touch q0-q7 @@ -781,14 +826,14 @@ end_idct16x16_pass2 ; TODO(hkuang): Find a better way to load the elements. ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15 - vld2.s16 {q8,q9}, [r0]! - vld2.s16 {q9,q10}, [r0]! - vld2.s16 {q10,q11}, [r0]! - vld2.s16 {q11,q12}, [r0]! - vld2.s16 {q12,q13}, [r0]! - vld2.s16 {q13,q14}, [r0]! - vld2.s16 {q14,q15}, [r0]! - vld2.s16 {q1,q2}, [r0]! 
+ LOAD_TRAN_LOW_TO_S16X2 d16, d17, d18, d19, r0 + LOAD_TRAN_LOW_TO_S16X2 d18, d19, d20, d21, r0 + LOAD_TRAN_LOW_TO_S16X2 d20, d21, d22, d23, r0 + LOAD_TRAN_LOW_TO_S16X2 d22, d23, d24, d25, r0 + LOAD_TRAN_LOW_TO_S16X2 d24, d25, d26, d27, r0 + LOAD_TRAN_LOW_TO_S16X2 d26, d27, d28, d29, r0 + LOAD_TRAN_LOW_TO_S16X2 d28, d29, d30, d31, r0 + LOAD_TRAN_LOW_TO_S16X2 d2, d3, d4, d5, r0 vmov.s16 q15, q1 ; cospi_28_64*2 = 6392 @@ -846,12 +891,12 @@ end_idct16x16_pass2 vadd.s32 q10, q10, q12 ; dct_const_round_shift(temp1) - vqrshrn.s32 d11, q15, #14 ; >> 14 - vqrshrn.s32 d10, q6, #14 ; >> 14 + vrshrn.s32 d11, q15, #14 ; >> 14 + vrshrn.s32 d10, q6, #14 ; >> 14 ; dct_const_round_shift(temp2) - vqrshrn.s32 d12, q9, #14 ; >> 14 - vqrshrn.s32 d13, q10, #14 ; >> 14 + vrshrn.s32 d12, q9, #14 ; >> 14 + vrshrn.s32 d13, q10, #14 ; >> 14 ; stage 6 vadd.s16 q2, q8, q7 ; step2[0] = step1[0] + step1[7]; @@ -864,39 +909,21 @@ end_idct16x16_pass2 vsub.s16 q15, q8, q7 ; step2[7] = step1[0] - step1[7]; ; store the data - vst1.64 {d4}, [r1], r2 - vst1.64 {d5}, [r1], r2 - vst1.64 {d18}, [r1], r2 - vst1.64 {d19}, [r1], r2 - vst1.64 {d20}, [r1], r2 - vst1.64 {d21}, [r1], r2 - vst1.64 {d22}, [r1], r2 - vst1.64 {d23}, [r1], r2 - vst1.64 {d24}, [r1], r2 - vst1.64 {d25}, [r1], r2 - vst1.64 {d26}, [r1], r2 - vst1.64 {d27}, [r1], r2 - vst1.64 {d28}, [r1], r2 - vst1.64 {d29}, [r1], r2 - vst1.64 {d30}, [r1], r2 - vst1.64 {d31}, [r1], r2 + vst1.64 {q2}, [r1]! + vst1.64 {q9-q10}, [r1]! + vst1.64 {q11-q12}, [r1]! + vst1.64 {q13-q14}, [r1]! + vst1.64 {q15}, [r1] bx lr ENDP ; |vpx_idct16x16_10_add_neon_pass1| -;void vpx_idct16x16_10_add_neon_pass2(int16_t *src, -; int16_t *output, -; int16_t *pass1Output, -; int16_t skip_adding, -; uint8_t *dest, -; int dest_stride) +;void vpx_idct16x16_10_add_neon_pass2(const tran_low_t *src, int16_t *output, +; int16_t *pass1_output) ; -; r0 int16_t *src -; r1 int16_t *output, -; r2 int16_t *pass1Output, -; r3 int16_t skip_adding, -; r4 uint8_t *dest, -; r5 int dest_stride) +; r0 const tran_low_t *src +; r1 int16_t *output +; r2 int16_t *pass1_output ; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output ; will be stored back into q8-q15 registers. This function will touch q0-q7 @@ -906,14 +933,14 @@ end_idct16x16_pass2 ; TODO(hkuang): Find a better way to load the elements. ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15 - vld2.s16 {q8,q9}, [r0]! - vld2.s16 {q9,q10}, [r0]! - vld2.s16 {q10,q11}, [r0]! - vld2.s16 {q11,q12}, [r0]! - vld2.s16 {q12,q13}, [r0]! - vld2.s16 {q13,q14}, [r0]! - vld2.s16 {q14,q15}, [r0]! - vld2.s16 {q0,q1}, [r0]! 
+ LOAD_TRAN_LOW_TO_S16X2 d16, d17, d18, d19, r0 + LOAD_TRAN_LOW_TO_S16X2 d18, d19, d20, d21, r0 + LOAD_TRAN_LOW_TO_S16X2 d20, d21, d22, d23, r0 + LOAD_TRAN_LOW_TO_S16X2 d22, d23, d24, d25, r0 + LOAD_TRAN_LOW_TO_S16X2 d24, d25, d26, d27, r0 + LOAD_TRAN_LOW_TO_S16X2 d26, d27, d28, d29, r0 + LOAD_TRAN_LOW_TO_S16X2 d28, d29, d30, d31, r0 + LOAD_TRAN_LOW_TO_S16X2 d0, d1, d2, d3, r0 vmov.s16 q15, q0; ; 2*cospi_30_64 = 3212 @@ -981,12 +1008,12 @@ end_idct16x16_pass2 vdup.16 d30, r12 ; duplicate -cospi_8_64 ; dct_const_round_shift(temp1) - vqrshrn.s32 d2, q12, #14 ; >> 14 - vqrshrn.s32 d3, q5, #14 ; >> 14 + vrshrn.s32 d2, q12, #14 ; >> 14 + vrshrn.s32 d3, q5, #14 ; >> 14 ; dct_const_round_shift(temp2) - vqrshrn.s32 d12, q2, #14 ; >> 14 - vqrshrn.s32 d13, q11, #14 ; >> 14 + vrshrn.s32 d12, q2, #14 ; >> 14 + vrshrn.s32 d13, q11, #14 ; >> 14 ; - step1[13] * cospi_8_64 vmull.s16 q10, d8, d30 @@ -1005,12 +1032,12 @@ end_idct16x16_pass2 vmlal.s16 q9, d9, d31 ; dct_const_round_shift(temp1) - vqrshrn.s32 d4, q10, #14 ; >> 14 - vqrshrn.s32 d5, q13, #14 ; >> 14 + vrshrn.s32 d4, q10, #14 ; >> 14 + vrshrn.s32 d5, q13, #14 ; >> 14 ; dct_const_round_shift(temp2) - vqrshrn.s32 d10, q8, #14 ; >> 14 - vqrshrn.s32 d11, q9, #14 ; >> 14 + vrshrn.s32 d10, q8, #14 ; >> 14 + vrshrn.s32 d11, q9, #14 ; >> 14 ; stage 5 vadd.s16 q8, q0, q3 ; step1[8] = step2[8]+step2[11]; @@ -1045,12 +1072,12 @@ end_idct16x16_pass2 vadd.s32 q1, q4, q1 ; dct_const_round_shift(temp1) - vqrshrn.s32 d4, q5, #14 ; >> 14 - vqrshrn.s32 d5, q6, #14 ; >> 14 + vrshrn.s32 d4, q5, #14 ; >> 14 + vrshrn.s32 d5, q6, #14 ; >> 14 ; dct_const_round_shift(temp2) - vqrshrn.s32 d10, q0, #14 ; >> 14 - vqrshrn.s32 d11, q1, #14 ; >> 14 + vrshrn.s32 d10, q0, #14 ; >> 14 + vrshrn.s32 d11, q1, #14 ; >> 14 ; step1[11] * cospi_16_64 vmull.s16 q0, d22, d14 @@ -1069,14 +1096,14 @@ end_idct16x16_pass2 vadd.s32 q6, q6, q1 ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d6, q10, #14 ; >> 14 - vqrshrn.s32 d7, q4, #14 ; >> 14 + vrshrn.s32 d6, q10, #14 ; >> 14 + vrshrn.s32 d7, q4, #14 ; >> 14 ; dct_const_round_shift((step1[11] + step1[12]) * cospi_16_64); - vqrshrn.s32 d8, q13, #14 ; >> 14 - vqrshrn.s32 d9, q6, #14 ; >> 14 + vrshrn.s32 d8, q13, #14 ; >> 14 + vrshrn.s32 d9, q6, #14 ; >> 14 - mov r4, #16 ; pass1Output stride + mov r4, #16 ; pass1_output stride ldr r3, [sp] ; load skip_adding ; stage 7 diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.c index f682afc7bf6..0c891919b76 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.c @@ -10,1218 +10,813 @@ #include <arm_neon.h> -#include "./vpx_config.h" -#include "vpx_dsp/arm/transpose_neon.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" #include "vpx_dsp/txfm_common.h" -void vpx_idct16x16_256_add_neon_pass1(int16_t *in, int16_t *out, - int output_stride) { - int16x4_t d0s16, d1s16, d2s16, d3s16; - int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; - uint64x1_t d16u64, d17u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64; - uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64; - int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; - int16x8_t q8s16, q9s16, q10s16, 
q11s16, q12s16, q13s16, q14s16, q15s16; - int32x4_t q0s32, q1s32, q2s32, q3s32, q5s32, q6s32, q9s32; - int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32; - int16x8x2_t q0x2s16; - - q0x2s16 = vld2q_s16(in); - q8s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q9s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q10s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q11s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q12s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q13s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q14s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q15s16 = q0x2s16.val[0]; - - transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - d16s16 = vget_low_s16(q8s16); - d17s16 = vget_high_s16(q8s16); - d18s16 = vget_low_s16(q9s16); - d19s16 = vget_high_s16(q9s16); - d20s16 = vget_low_s16(q10s16); - d21s16 = vget_high_s16(q10s16); - d22s16 = vget_low_s16(q11s16); - d23s16 = vget_high_s16(q11s16); - d24s16 = vget_low_s16(q12s16); - d25s16 = vget_high_s16(q12s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - d28s16 = vget_low_s16(q14s16); - d29s16 = vget_high_s16(q14s16); - d30s16 = vget_low_s16(q15s16); - d31s16 = vget_high_s16(q15s16); +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE void idct16x16_256_add_load_tran_low_kernel( + const tran_low_t **input, int16_t **out) { + int16x8_t s; - // stage 3 - d0s16 = vdup_n_s16((int16_t)cospi_28_64); - d1s16 = vdup_n_s16((int16_t)cospi_4_64); - - q2s32 = vmull_s16(d18s16, d0s16); - q3s32 = vmull_s16(d19s16, d0s16); - q5s32 = vmull_s16(d18s16, d1s16); - q6s32 = vmull_s16(d19s16, d1s16); - - q2s32 = vmlsl_s16(q2s32, d30s16, d1s16); - q3s32 = vmlsl_s16(q3s32, d31s16, d1s16); - q5s32 = vmlal_s16(q5s32, d30s16, d0s16); - q6s32 = vmlal_s16(q6s32, d31s16, d0s16); - - d2s16 = vdup_n_s16((int16_t)cospi_12_64); - d3s16 = vdup_n_s16((int16_t)cospi_20_64); - - d8s16 = vqrshrn_n_s32(q2s32, 14); - d9s16 = vqrshrn_n_s32(q3s32, 14); - d14s16 = vqrshrn_n_s32(q5s32, 14); - d15s16 = vqrshrn_n_s32(q6s32, 14); - q4s16 = vcombine_s16(d8s16, d9s16); - q7s16 = vcombine_s16(d14s16, d15s16); - - q2s32 = vmull_s16(d26s16, d2s16); - q3s32 = vmull_s16(d27s16, d2s16); - q9s32 = vmull_s16(d26s16, d3s16); - q15s32 = vmull_s16(d27s16, d3s16); - - q2s32 = vmlsl_s16(q2s32, d22s16, d3s16); - q3s32 = vmlsl_s16(q3s32, d23s16, d3s16); - q9s32 = vmlal_s16(q9s32, d22s16, d2s16); - q15s32 = vmlal_s16(q15s32, d23s16, d2s16); - - d10s16 = vqrshrn_n_s32(q2s32, 14); - d11s16 = vqrshrn_n_s32(q3s32, 14); - d12s16 = vqrshrn_n_s32(q9s32, 14); - d13s16 = vqrshrn_n_s32(q15s32, 14); - q5s16 = vcombine_s16(d10s16, d11s16); - q6s16 = vcombine_s16(d12s16, d13s16); + s = load_tran_low_to_s16q(*input); + vst1q_s16(*out, s); + *input += 8; + *out += 8; +} - // stage 4 - d30s16 = vdup_n_s16((int16_t)cospi_16_64); - - q2s32 = vmull_s16(d16s16, d30s16); - q11s32 = vmull_s16(d17s16, d30s16); - q0s32 = vmull_s16(d24s16, d30s16); - q1s32 = vmull_s16(d25s16, d30s16); - - d30s16 = vdup_n_s16((int16_t)cospi_24_64); - d31s16 = vdup_n_s16((int16_t)cospi_8_64); - - q3s32 = vaddq_s32(q2s32, q0s32); - q12s32 = vaddq_s32(q11s32, q1s32); - q13s32 = vsubq_s32(q2s32, q0s32); - q1s32 = vsubq_s32(q11s32, q1s32); - - d16s16 = vqrshrn_n_s32(q3s32, 14); - d17s16 = vqrshrn_n_s32(q12s32, 14); - d18s16 = vqrshrn_n_s32(q13s32, 14); - d19s16 = vqrshrn_n_s32(q1s32, 14); - q8s16 = vcombine_s16(d16s16, d17s16); - q9s16 = vcombine_s16(d18s16, d19s16); - - q0s32 = 
vmull_s16(d20s16, d31s16); - q1s32 = vmull_s16(d21s16, d31s16); - q12s32 = vmull_s16(d20s16, d30s16); - q13s32 = vmull_s16(d21s16, d30s16); - - q0s32 = vmlal_s16(q0s32, d28s16, d30s16); - q1s32 = vmlal_s16(q1s32, d29s16, d30s16); - q12s32 = vmlsl_s16(q12s32, d28s16, d31s16); - q13s32 = vmlsl_s16(q13s32, d29s16, d31s16); - - d22s16 = vqrshrn_n_s32(q0s32, 14); - d23s16 = vqrshrn_n_s32(q1s32, 14); - d20s16 = vqrshrn_n_s32(q12s32, 14); - d21s16 = vqrshrn_n_s32(q13s32, 14); - q10s16 = vcombine_s16(d20s16, d21s16); - q11s16 = vcombine_s16(d22s16, d23s16); - - q13s16 = vsubq_s16(q4s16, q5s16); - q4s16 = vaddq_s16(q4s16, q5s16); - q14s16 = vsubq_s16(q7s16, q6s16); - q15s16 = vaddq_s16(q6s16, q7s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - d28s16 = vget_low_s16(q14s16); - d29s16 = vget_high_s16(q14s16); +static INLINE void idct16x16_256_add_load_tran_low(const tran_low_t *input, + int16_t *out) { + idct16x16_256_add_load_tran_low_kernel(&input, &out); + idct16x16_256_add_load_tran_low_kernel(&input, &out); + idct16x16_256_add_load_tran_low_kernel(&input, &out); + idct16x16_256_add_load_tran_low_kernel(&input, &out); + idct16x16_256_add_load_tran_low_kernel(&input, &out); + idct16x16_256_add_load_tran_low_kernel(&input, &out); + idct16x16_256_add_load_tran_low_kernel(&input, &out); + idct16x16_256_add_load_tran_low_kernel(&input, &out); + idct16x16_256_add_load_tran_low_kernel(&input, &out); + idct16x16_256_add_load_tran_low_kernel(&input, &out); + idct16x16_256_add_load_tran_low_kernel(&input, &out); + idct16x16_256_add_load_tran_low_kernel(&input, &out); + idct16x16_256_add_load_tran_low_kernel(&input, &out); + idct16x16_256_add_load_tran_low_kernel(&input, &out); + idct16x16_256_add_load_tran_low_kernel(&input, &out); + idct16x16_256_add_load_tran_low_kernel(&input, &out); + idct16x16_256_add_load_tran_low_kernel(&input, &out); + idct16x16_256_add_load_tran_low_kernel(&input, &out); + idct16x16_256_add_load_tran_low_kernel(&input, &out); + idct16x16_256_add_load_tran_low_kernel(&input, &out); + idct16x16_256_add_load_tran_low_kernel(&input, &out); + idct16x16_256_add_load_tran_low_kernel(&input, &out); + idct16x16_256_add_load_tran_low_kernel(&input, &out); + idct16x16_256_add_load_tran_low_kernel(&input, &out); + idct16x16_256_add_load_tran_low_kernel(&input, &out); + idct16x16_256_add_load_tran_low_kernel(&input, &out); + idct16x16_256_add_load_tran_low_kernel(&input, &out); + idct16x16_256_add_load_tran_low_kernel(&input, &out); + idct16x16_256_add_load_tran_low_kernel(&input, &out); + idct16x16_256_add_load_tran_low_kernel(&input, &out); + idct16x16_256_add_load_tran_low_kernel(&input, &out); + idct16x16_256_add_load_tran_low_kernel(&input, &out); +} +#endif // CONFIG_VP9_HIGHBITDEPTH - // stage 5 - q0s16 = vaddq_s16(q8s16, q11s16); - q1s16 = vaddq_s16(q9s16, q10s16); - q2s16 = vsubq_s16(q9s16, q10s16); - q3s16 = vsubq_s16(q8s16, q11s16); - - d16s16 = vdup_n_s16((int16_t)cospi_16_64); - - q11s32 = vmull_s16(d26s16, d16s16); - q12s32 = vmull_s16(d27s16, d16s16); - q9s32 = vmull_s16(d28s16, d16s16); - q10s32 = vmull_s16(d29s16, d16s16); - - q6s32 = vsubq_s32(q9s32, q11s32); - q13s32 = vsubq_s32(q10s32, q12s32); - q9s32 = vaddq_s32(q9s32, q11s32); - q10s32 = vaddq_s32(q10s32, q12s32); - - d10s16 = vqrshrn_n_s32(q6s32, 14); - d11s16 = vqrshrn_n_s32(q13s32, 14); - d12s16 = vqrshrn_n_s32(q9s32, 14); - d13s16 = vqrshrn_n_s32(q10s32, 14); - q5s16 = vcombine_s16(d10s16, d11s16); - q6s16 = vcombine_s16(d12s16, d13s16); +static INLINE void wrap_low_4x2(const int32x4_t *const 
t32, int16x4_t *const d0, + int16x4_t *const d1) { + *d0 = vrshrn_n_s32(t32[0], 14); + *d1 = vrshrn_n_s32(t32[1], 14); +} - // stage 6 - q8s16 = vaddq_s16(q0s16, q15s16); - q9s16 = vaddq_s16(q1s16, q6s16); - q10s16 = vaddq_s16(q2s16, q5s16); - q11s16 = vaddq_s16(q3s16, q4s16); - q12s16 = vsubq_s16(q3s16, q4s16); - q13s16 = vsubq_s16(q2s16, q5s16); - q14s16 = vsubq_s16(q1s16, q6s16); - q15s16 = vsubq_s16(q0s16, q15s16); - - d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16)); - d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16)); - d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16)); - d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16)); - d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16)); - d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16)); - d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16)); - d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16)); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16)); - d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16)); - d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16)); - d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16)); - - // store the data - output_stride >>= 1; // output_stride / 2, out is int16_t - vst1_u64((uint64_t *)out, d16u64); - out += output_stride; - vst1_u64((uint64_t *)out, d17u64); - out += output_stride; - vst1_u64((uint64_t *)out, d18u64); - out += output_stride; - vst1_u64((uint64_t *)out, d19u64); - out += output_stride; - vst1_u64((uint64_t *)out, d20u64); - out += output_stride; - vst1_u64((uint64_t *)out, d21u64); - out += output_stride; - vst1_u64((uint64_t *)out, d22u64); - out += output_stride; - vst1_u64((uint64_t *)out, d23u64); - out += output_stride; - vst1_u64((uint64_t *)out, d24u64); - out += output_stride; - vst1_u64((uint64_t *)out, d25u64); - out += output_stride; - vst1_u64((uint64_t *)out, d26u64); - out += output_stride; - vst1_u64((uint64_t *)out, d27u64); - out += output_stride; - vst1_u64((uint64_t *)out, d28u64); - out += output_stride; - vst1_u64((uint64_t *)out, d29u64); - out += output_stride; - vst1_u64((uint64_t *)out, d30u64); - out += output_stride; - vst1_u64((uint64_t *)out, d31u64); +static INLINE void idct_cospi_2_30(const int16x8_t s0, const int16x8_t s1, + const int16x4_t cospi_2_30_10_22, + int16x8_t *const d0, int16x8_t *const d1) { + int32x4_t t32[6]; + + t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_2_30_10_22, 1); + t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_2_30_10_22, 1); + t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_2_30_10_22, 1); + t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_2_30_10_22, 1); + t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_2_30_10_22, 0); + t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_2_30_10_22, 0); + t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_2_30_10_22, 0); + t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_2_30_10_22, 0); + idct16x16_add_wrap_low_8x2(t32, d0, d1); } -void vpx_idct16x16_256_add_neon_pass2(int16_t *src, int16_t *out, - int16_t *pass1Output, int16_t skip_adding, - uint8_t *dest, int dest_stride) { - uint8_t *d; - uint8x8_t d12u8, d13u8; - int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; - int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; - int16x4_t 
d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; - uint64x1_t d24u64, d25u64, d26u64, d27u64; - int64x1_t d12s64, d13s64; - uint16x8_t q2u16, q3u16, q4u16, q5u16, q8u16; - uint16x8_t q9u16, q12u16, q13u16, q14u16, q15u16; - int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32; - int32x4_t q10s32, q11s32, q12s32, q13s32; - int16x8x2_t q0x2s16; - - q0x2s16 = vld2q_s16(src); - q8s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q9s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q10s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q11s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q12s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q13s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q14s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q15s16 = q0x2s16.val[0]; - - transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - d16s16 = vget_low_s16(q8s16); - d17s16 = vget_high_s16(q8s16); - d18s16 = vget_low_s16(q9s16); - d19s16 = vget_high_s16(q9s16); - d20s16 = vget_low_s16(q10s16); - d21s16 = vget_high_s16(q10s16); - d22s16 = vget_low_s16(q11s16); - d23s16 = vget_high_s16(q11s16); - d24s16 = vget_low_s16(q12s16); - d25s16 = vget_high_s16(q12s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - d28s16 = vget_low_s16(q14s16); - d29s16 = vget_high_s16(q14s16); - d30s16 = vget_low_s16(q15s16); - d31s16 = vget_high_s16(q15s16); +static INLINE void idct_cospi_4_28(const int16x8_t s0, const int16x8_t s1, + const int16x4_t cospi_4_12_20N_28, + int16x8_t *const d0, int16x8_t *const d1) { + int32x4_t t32[6]; + + t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_4_12_20N_28, 3); + t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_4_12_20N_28, 3); + t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_4_12_20N_28, 3); + t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_4_12_20N_28, 3); + t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_4_12_20N_28, 0); + t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_4_12_20N_28, 0); + t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_4_12_20N_28, 0); + t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_4_12_20N_28, 0); + idct16x16_add_wrap_low_8x2(t32, d0, d1); +} - // stage 3 - d12s16 = vdup_n_s16((int16_t)cospi_30_64); - d13s16 = vdup_n_s16((int16_t)cospi_2_64); - - q2s32 = vmull_s16(d16s16, d12s16); - q3s32 = vmull_s16(d17s16, d12s16); - q1s32 = vmull_s16(d16s16, d13s16); - q4s32 = vmull_s16(d17s16, d13s16); - - q2s32 = vmlsl_s16(q2s32, d30s16, d13s16); - q3s32 = vmlsl_s16(q3s32, d31s16, d13s16); - q1s32 = vmlal_s16(q1s32, d30s16, d12s16); - q4s32 = vmlal_s16(q4s32, d31s16, d12s16); - - d0s16 = vqrshrn_n_s32(q2s32, 14); - d1s16 = vqrshrn_n_s32(q3s32, 14); - d14s16 = vqrshrn_n_s32(q1s32, 14); - d15s16 = vqrshrn_n_s32(q4s32, 14); - q0s16 = vcombine_s16(d0s16, d1s16); - q7s16 = vcombine_s16(d14s16, d15s16); - - d30s16 = vdup_n_s16((int16_t)cospi_14_64); - d31s16 = vdup_n_s16((int16_t)cospi_18_64); - - q2s32 = vmull_s16(d24s16, d30s16); - q3s32 = vmull_s16(d25s16, d30s16); - q4s32 = vmull_s16(d24s16, d31s16); - q5s32 = vmull_s16(d25s16, d31s16); - - q2s32 = vmlsl_s16(q2s32, d22s16, d31s16); - q3s32 = vmlsl_s16(q3s32, d23s16, d31s16); - q4s32 = vmlal_s16(q4s32, d22s16, d30s16); - q5s32 = vmlal_s16(q5s32, d23s16, 
d30s16); - - d2s16 = vqrshrn_n_s32(q2s32, 14); - d3s16 = vqrshrn_n_s32(q3s32, 14); - d12s16 = vqrshrn_n_s32(q4s32, 14); - d13s16 = vqrshrn_n_s32(q5s32, 14); - q1s16 = vcombine_s16(d2s16, d3s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - d30s16 = vdup_n_s16((int16_t)cospi_22_64); - d31s16 = vdup_n_s16((int16_t)cospi_10_64); - - q11s32 = vmull_s16(d20s16, d30s16); - q12s32 = vmull_s16(d21s16, d30s16); - q4s32 = vmull_s16(d20s16, d31s16); - q5s32 = vmull_s16(d21s16, d31s16); - - q11s32 = vmlsl_s16(q11s32, d26s16, d31s16); - q12s32 = vmlsl_s16(q12s32, d27s16, d31s16); - q4s32 = vmlal_s16(q4s32, d26s16, d30s16); - q5s32 = vmlal_s16(q5s32, d27s16, d30s16); - - d4s16 = vqrshrn_n_s32(q11s32, 14); - d5s16 = vqrshrn_n_s32(q12s32, 14); - d11s16 = vqrshrn_n_s32(q5s32, 14); - d10s16 = vqrshrn_n_s32(q4s32, 14); - q2s16 = vcombine_s16(d4s16, d5s16); - q5s16 = vcombine_s16(d10s16, d11s16); - - d30s16 = vdup_n_s16((int16_t)cospi_6_64); - d31s16 = vdup_n_s16((int16_t)cospi_26_64); - - q10s32 = vmull_s16(d28s16, d30s16); - q11s32 = vmull_s16(d29s16, d30s16); - q12s32 = vmull_s16(d28s16, d31s16); - q13s32 = vmull_s16(d29s16, d31s16); - - q10s32 = vmlsl_s16(q10s32, d18s16, d31s16); - q11s32 = vmlsl_s16(q11s32, d19s16, d31s16); - q12s32 = vmlal_s16(q12s32, d18s16, d30s16); - q13s32 = vmlal_s16(q13s32, d19s16, d30s16); - - d6s16 = vqrshrn_n_s32(q10s32, 14); - d7s16 = vqrshrn_n_s32(q11s32, 14); - d8s16 = vqrshrn_n_s32(q12s32, 14); - d9s16 = vqrshrn_n_s32(q13s32, 14); - q3s16 = vcombine_s16(d6s16, d7s16); - q4s16 = vcombine_s16(d8s16, d9s16); +static INLINE void idct_cospi_6_26(const int16x8_t s0, const int16x8_t s1, + const int16x4_t cospi_6_26_14_18N, + int16x8_t *const d0, int16x8_t *const d1) { + int32x4_t t32[6]; + + t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_6_26_14_18N, 0); + t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_6_26_14_18N, 0); + t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_6_26_14_18N, 0); + t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_6_26_14_18N, 0); + t32[0] = vmlal_lane_s16(t32[0], vget_low_s16(s1), cospi_6_26_14_18N, 1); + t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_6_26_14_18N, 1); + t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_6_26_14_18N, 1); + t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_6_26_14_18N, 1); + idct16x16_add_wrap_low_8x2(t32, d0, d1); +} + +static INLINE void idct_cospi_8_24_d_kernel(const int16x4_t s0, + const int16x4_t s1, + const int16x4_t cospi_0_8_16_24, + int32x4_t *const t32) { + t32[0] = vmull_lane_s16(s0, cospi_0_8_16_24, 3); + t32[1] = vmull_lane_s16(s1, cospi_0_8_16_24, 3); + t32[0] = vmlsl_lane_s16(t32[0], s1, cospi_0_8_16_24, 1); + t32[1] = vmlal_lane_s16(t32[1], s0, cospi_0_8_16_24, 1); +} + +static INLINE void idct_cospi_8_24_d(const int16x4_t s0, const int16x4_t s1, + const int16x4_t cospi_0_8_16_24, + int16x4_t *const d0, int16x4_t *const d1) { + int32x4_t t32[2]; + + idct_cospi_8_24_d_kernel(s0, s1, cospi_0_8_16_24, t32); + wrap_low_4x2(t32, d0, d1); +} + +static INLINE void idct_cospi_8_24_neg_d(const int16x4_t s0, const int16x4_t s1, + const int16x4_t cospi_0_8_16_24, + int16x4_t *const d0, + int16x4_t *const d1) { + int32x4_t t32[2]; + + idct_cospi_8_24_d_kernel(s0, s1, cospi_0_8_16_24, t32); + t32[1] = vnegq_s32(t32[1]); + wrap_low_4x2(t32, d0, d1); +} + +static INLINE void idct_cospi_10_22(const int16x8_t s0, const int16x8_t s1, + const int16x4_t cospi_2_30_10_22, + int16x8_t *const d0, int16x8_t *const d1) { + int32x4_t t32[6]; + + t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_2_30_10_22, 3); + 
t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_2_30_10_22, 3); + t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_2_30_10_22, 3); + t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_2_30_10_22, 3); + t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_2_30_10_22, 2); + t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_2_30_10_22, 2); + t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_2_30_10_22, 2); + t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_2_30_10_22, 2); + idct16x16_add_wrap_low_8x2(t32, d0, d1); +} + +static INLINE void idct_cospi_12_20(const int16x8_t s0, const int16x8_t s1, + const int16x4_t cospi_4_12_20N_28, + int16x8_t *const d0, int16x8_t *const d1) { + int32x4_t t32[6]; + + t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_4_12_20N_28, 1); + t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_4_12_20N_28, 1); + t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_4_12_20N_28, 1); + t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_4_12_20N_28, 1); + t32[0] = vmlal_lane_s16(t32[0], vget_low_s16(s1), cospi_4_12_20N_28, 2); + t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_4_12_20N_28, 2); + t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_4_12_20N_28, 2); + t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_4_12_20N_28, 2); + idct16x16_add_wrap_low_8x2(t32, d0, d1); +} + +static INLINE void idct_cospi_14_18(const int16x8_t s0, const int16x8_t s1, + const int16x4_t cospi_6_26_14_18N, + int16x8_t *const d0, int16x8_t *const d1) { + int32x4_t t32[6]; + + t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_6_26_14_18N, 2); + t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_6_26_14_18N, 2); + t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_6_26_14_18N, 2); + t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_6_26_14_18N, 2); + t32[0] = vmlal_lane_s16(t32[0], vget_low_s16(s1), cospi_6_26_14_18N, 3); + t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_6_26_14_18N, 3); + t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_6_26_14_18N, 3); + t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_6_26_14_18N, 3); + idct16x16_add_wrap_low_8x2(t32, d0, d1); +} + +static INLINE void idct_cospi_16_16_d(const int16x4_t s0, const int16x4_t s1, + const int16x4_t cospi_0_8_16_24, + int16x4_t *const d0, + int16x4_t *const d1) { + int32x4_t t32[3]; + + t32[2] = vmull_lane_s16(s1, cospi_0_8_16_24, 2); + t32[0] = vmlsl_lane_s16(t32[2], s0, cospi_0_8_16_24, 2); + t32[1] = vmlal_lane_s16(t32[2], s0, cospi_0_8_16_24, 2); + wrap_low_4x2(t32, d0, d1); +} + +static void idct16x16_256_add_half1d(const int16_t *input, int16_t *output, + uint8_t *dest, int stride) { + const int16x8_t cospis0 = vld1q_s16(kCospi); + const int16x8_t cospis1 = vld1q_s16(kCospi + 8); + const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0); + const int16x4_t cospi_4_12_20N_28 = vget_high_s16(cospis0); + const int16x4_t cospi_2_30_10_22 = vget_low_s16(cospis1); + const int16x4_t cospi_6_26_14_18N = vget_high_s16(cospis1); + int16x8_t in[16], step1[16], step2[16], out[16]; + + // Load input (16x8) + in[0] = vld1q_s16(input); + input += 8; + in[8] = vld1q_s16(input); + input += 8; + in[1] = vld1q_s16(input); + input += 8; + in[9] = vld1q_s16(input); + input += 8; + in[2] = vld1q_s16(input); + input += 8; + in[10] = vld1q_s16(input); + input += 8; + in[3] = vld1q_s16(input); + input += 8; + in[11] = vld1q_s16(input); + input += 8; + in[4] = vld1q_s16(input); + input += 8; + in[12] = vld1q_s16(input); + input += 8; + in[5] = vld1q_s16(input); + input += 8; + in[13] = vld1q_s16(input); 
+ input += 8; + in[6] = vld1q_s16(input); + input += 8; + in[14] = vld1q_s16(input); + input += 8; + in[7] = vld1q_s16(input); + input += 8; + in[15] = vld1q_s16(input); + + // Transpose + transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6], + &in[7]); + transpose_s16_8x8(&in[8], &in[9], &in[10], &in[11], &in[12], &in[13], &in[14], + &in[15]); + + // stage 1 + step1[0] = in[0 / 2]; + step1[1] = in[16 / 2]; + step1[2] = in[8 / 2]; + step1[3] = in[24 / 2]; + step1[4] = in[4 / 2]; + step1[5] = in[20 / 2]; + step1[6] = in[12 / 2]; + step1[7] = in[28 / 2]; + step1[8] = in[2 / 2]; + step1[9] = in[18 / 2]; + step1[10] = in[10 / 2]; + step1[11] = in[26 / 2]; + step1[12] = in[6 / 2]; + step1[13] = in[22 / 2]; + step1[14] = in[14 / 2]; + step1[15] = in[30 / 2]; + + // stage 2 + step2[0] = step1[0]; + step2[1] = step1[1]; + step2[2] = step1[2]; + step2[3] = step1[3]; + step2[4] = step1[4]; + step2[5] = step1[5]; + step2[6] = step1[6]; + step2[7] = step1[7]; + idct_cospi_2_30(step1[8], step1[15], cospi_2_30_10_22, &step2[8], &step2[15]); + idct_cospi_14_18(step1[9], step1[14], cospi_6_26_14_18N, &step2[9], + &step2[14]); + idct_cospi_10_22(step1[10], step1[13], cospi_2_30_10_22, &step2[10], + &step2[13]); + idct_cospi_6_26(step1[11], step1[12], cospi_6_26_14_18N, &step2[11], + &step2[12]); // stage 3 - q9s16 = vsubq_s16(q0s16, q1s16); - q0s16 = vaddq_s16(q0s16, q1s16); - q10s16 = vsubq_s16(q3s16, q2s16); - q11s16 = vaddq_s16(q2s16, q3s16); - q12s16 = vaddq_s16(q4s16, q5s16); - q13s16 = vsubq_s16(q4s16, q5s16); - q14s16 = vsubq_s16(q7s16, q6s16); - q7s16 = vaddq_s16(q6s16, q7s16); + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[2]; + step1[3] = step2[3]; + idct_cospi_4_28(step2[4], step2[7], cospi_4_12_20N_28, &step1[4], &step1[7]); + idct_cospi_12_20(step2[5], step2[6], cospi_4_12_20N_28, &step1[5], &step1[6]); + step1[8] = vaddq_s16(step2[8], step2[9]); + step1[9] = vsubq_s16(step2[8], step2[9]); + step1[10] = vsubq_s16(step2[11], step2[10]); + step1[11] = vaddq_s16(step2[11], step2[10]); + step1[12] = vaddq_s16(step2[12], step2[13]); + step1[13] = vsubq_s16(step2[12], step2[13]); + step1[14] = vsubq_s16(step2[15], step2[14]); + step1[15] = vaddq_s16(step2[15], step2[14]); // stage 4 - d18s16 = vget_low_s16(q9s16); - d19s16 = vget_high_s16(q9s16); - d20s16 = vget_low_s16(q10s16); - d21s16 = vget_high_s16(q10s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - d28s16 = vget_low_s16(q14s16); - d29s16 = vget_high_s16(q14s16); - - d30s16 = vdup_n_s16((int16_t)cospi_8_64); - d31s16 = vdup_n_s16((int16_t)cospi_24_64); - - q2s32 = vmull_s16(d18s16, d31s16); - q3s32 = vmull_s16(d19s16, d31s16); - q4s32 = vmull_s16(d28s16, d31s16); - q5s32 = vmull_s16(d29s16, d31s16); - - q2s32 = vmlal_s16(q2s32, d28s16, d30s16); - q3s32 = vmlal_s16(q3s32, d29s16, d30s16); - q4s32 = vmlsl_s16(q4s32, d18s16, d30s16); - q5s32 = vmlsl_s16(q5s32, d19s16, d30s16); - - d12s16 = vqrshrn_n_s32(q2s32, 14); - d13s16 = vqrshrn_n_s32(q3s32, 14); - d2s16 = vqrshrn_n_s32(q4s32, 14); - d3s16 = vqrshrn_n_s32(q5s32, 14); - q1s16 = vcombine_s16(d2s16, d3s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - q3s16 = q11s16; - q4s16 = q12s16; - - d30s16 = vdup_n_s16(-cospi_8_64); - q11s32 = vmull_s16(d26s16, d30s16); - q12s32 = vmull_s16(d27s16, d30s16); - q8s32 = vmull_s16(d20s16, d30s16); - q9s32 = vmull_s16(d21s16, d30s16); - - q11s32 = vmlsl_s16(q11s32, d20s16, d31s16); - q12s32 = vmlsl_s16(q12s32, d21s16, d31s16); - q8s32 = vmlal_s16(q8s32, d26s16, d31s16); - q9s32 = vmlal_s16(q9s32, 
d27s16, d31s16); - - d4s16 = vqrshrn_n_s32(q11s32, 14); - d5s16 = vqrshrn_n_s32(q12s32, 14); - d10s16 = vqrshrn_n_s32(q8s32, 14); - d11s16 = vqrshrn_n_s32(q9s32, 14); - q2s16 = vcombine_s16(d4s16, d5s16); - q5s16 = vcombine_s16(d10s16, d11s16); + idct_cospi_16_16_q(step1[1], step1[0], cospi_0_8_16_24, &step2[1], &step2[0]); + idct_cospi_8_24_q(step1[2], step1[3], cospi_0_8_16_24, &step2[2], &step2[3]); + step2[4] = vaddq_s16(step1[4], step1[5]); + step2[5] = vsubq_s16(step1[4], step1[5]); + step2[6] = vsubq_s16(step1[7], step1[6]); + step2[7] = vaddq_s16(step1[7], step1[6]); + step2[8] = step1[8]; + idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9], + &step2[14]); + idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24, &step2[13], + &step2[10]); + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; // stage 5 - q8s16 = vaddq_s16(q0s16, q3s16); - q9s16 = vaddq_s16(q1s16, q2s16); - q10s16 = vsubq_s16(q1s16, q2s16); - q11s16 = vsubq_s16(q0s16, q3s16); - q12s16 = vsubq_s16(q7s16, q4s16); - q13s16 = vsubq_s16(q6s16, q5s16); - q14s16 = vaddq_s16(q6s16, q5s16); - q15s16 = vaddq_s16(q7s16, q4s16); + step1[0] = vaddq_s16(step2[0], step2[3]); + step1[1] = vaddq_s16(step2[1], step2[2]); + step1[2] = vsubq_s16(step2[1], step2[2]); + step1[3] = vsubq_s16(step2[0], step2[3]); + step1[4] = step2[4]; + idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]); + step1[7] = step2[7]; + step1[8] = vaddq_s16(step2[8], step2[11]); + step1[9] = vaddq_s16(step2[9], step2[10]); + step1[10] = vsubq_s16(step2[9], step2[10]); + step1[11] = vsubq_s16(step2[8], step2[11]); + step1[12] = vsubq_s16(step2[15], step2[12]); + step1[13] = vsubq_s16(step2[14], step2[13]); + step1[14] = vaddq_s16(step2[14], step2[13]); + step1[15] = vaddq_s16(step2[15], step2[12]); // stage 6 - d20s16 = vget_low_s16(q10s16); - d21s16 = vget_high_s16(q10s16); - d22s16 = vget_low_s16(q11s16); - d23s16 = vget_high_s16(q11s16); - d24s16 = vget_low_s16(q12s16); - d25s16 = vget_high_s16(q12s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - - d14s16 = vdup_n_s16((int16_t)cospi_16_64); - - q3s32 = vmull_s16(d26s16, d14s16); - q4s32 = vmull_s16(d27s16, d14s16); - q0s32 = vmull_s16(d20s16, d14s16); - q1s32 = vmull_s16(d21s16, d14s16); - - q5s32 = vsubq_s32(q3s32, q0s32); - q6s32 = vsubq_s32(q4s32, q1s32); - q10s32 = vaddq_s32(q3s32, q0s32); - q4s32 = vaddq_s32(q4s32, q1s32); - - d4s16 = vqrshrn_n_s32(q5s32, 14); - d5s16 = vqrshrn_n_s32(q6s32, 14); - d10s16 = vqrshrn_n_s32(q10s32, 14); - d11s16 = vqrshrn_n_s32(q4s32, 14); - q2s16 = vcombine_s16(d4s16, d5s16); - q5s16 = vcombine_s16(d10s16, d11s16); - - q0s32 = vmull_s16(d22s16, d14s16); - q1s32 = vmull_s16(d23s16, d14s16); - q13s32 = vmull_s16(d24s16, d14s16); - q6s32 = vmull_s16(d25s16, d14s16); - - q10s32 = vsubq_s32(q13s32, q0s32); - q4s32 = vsubq_s32(q6s32, q1s32); - q13s32 = vaddq_s32(q13s32, q0s32); - q6s32 = vaddq_s32(q6s32, q1s32); - - d6s16 = vqrshrn_n_s32(q10s32, 14); - d7s16 = vqrshrn_n_s32(q4s32, 14); - d8s16 = vqrshrn_n_s32(q13s32, 14); - d9s16 = vqrshrn_n_s32(q6s32, 14); - q3s16 = vcombine_s16(d6s16, d7s16); - q4s16 = vcombine_s16(d8s16, d9s16); + step2[0] = vaddq_s16(step1[0], step1[7]); + step2[1] = vaddq_s16(step1[1], step1[6]); + step2[2] = vaddq_s16(step1[2], step1[5]); + step2[3] = vaddq_s16(step1[3], step1[4]); + step2[4] = vsubq_s16(step1[3], step1[4]); + step2[5] = vsubq_s16(step1[2], step1[5]); + step2[6] = vsubq_s16(step1[1], step1[6]); + step2[7] = vsubq_s16(step1[0], step1[7]); + 
idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10], + &step2[13]); + idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11], + &step2[12]); + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; // stage 7 - if (skip_adding != 0) { - d = dest; - // load the data in pass1 - q0s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q1s16 = vld1q_s16(pass1Output); - pass1Output += 8; - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - d13s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - - q12s16 = vaddq_s16(q0s16, q15s16); - q13s16 = vaddq_s16(q1s16, q14s16); - q12s16 = vrshrq_n_s16(q12s16, 6); - q13s16 = vrshrq_n_s16(q13s16, 6); - q12u16 = - vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64)); - q13u16 = - vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); - d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); - d += dest_stride; - q14s16 = vsubq_s16(q1s16, q14s16); - q15s16 = vsubq_s16(q0s16, q15s16); - - q10s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q11s16 = vld1q_s16(pass1Output); - pass1Output += 8; - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - d13s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q12s16 = vaddq_s16(q10s16, q5s16); - q13s16 = vaddq_s16(q11s16, q4s16); - q12s16 = vrshrq_n_s16(q12s16, 6); - q13s16 = vrshrq_n_s16(q13s16, 6); - q12u16 = - vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64)); - q13u16 = - vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); - d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); - d += dest_stride; - q4s16 = vsubq_s16(q11s16, q4s16); - q5s16 = vsubq_s16(q10s16, q5s16); - - q0s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q1s16 = vld1q_s16(pass1Output); - pass1Output += 8; - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - d13s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q12s16 = vaddq_s16(q0s16, q3s16); - q13s16 = vaddq_s16(q1s16, q2s16); - q12s16 = vrshrq_n_s16(q12s16, 6); - q13s16 = vrshrq_n_s16(q13s16, 6); - q12u16 = - vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64)); - q13u16 = - vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); - d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); - d += dest_stride; - q2s16 = vsubq_s16(q1s16, q2s16); - q3s16 = vsubq_s16(q0s16, q3s16); - - q10s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q11s16 = vld1q_s16(pass1Output); - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - d13s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q12s16 = vaddq_s16(q10s16, q9s16); - q13s16 = vaddq_s16(q11s16, q8s16); - q12s16 = vrshrq_n_s16(q12s16, 6); - q13s16 = vrshrq_n_s16(q13s16, 6); - q12u16 = - vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64)); - q13u16 = - vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); - d13u8 
= vqmovun_s16(vreinterpretq_s16_u16(q13u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); - d += dest_stride; - q8s16 = vsubq_s16(q11s16, q8s16); - q9s16 = vsubq_s16(q10s16, q9s16); - - // store the data out 8,9,10,11,12,13,14,15 - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q8s16 = vrshrq_n_s16(q8s16, 6); - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q9s16 = vrshrq_n_s16(q9s16, 6); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q2s16 = vrshrq_n_s16(q2s16, 6); - q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2s16), vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q3s16 = vrshrq_n_s16(q3s16, 6); - q3u16 = vaddw_u8(vreinterpretq_u16_s16(q3s16), vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q3u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q4s16 = vrshrq_n_s16(q4s16, 6); - q4u16 = vaddw_u8(vreinterpretq_u16_s16(q4s16), vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q4u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q5s16 = vrshrq_n_s16(q5s16, 6); - q5u16 = vaddw_u8(vreinterpretq_u16_s16(q5s16), vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q5u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q14s16 = vrshrq_n_s16(q14s16, 6); - q14u16 = - vaddw_u8(vreinterpretq_u16_s16(q14s16), vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q14u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - - d12s64 = vld1_s64((int64_t *)dest); - q15s16 = vrshrq_n_s16(q15s16, 6); - q15u16 = - vaddw_u8(vreinterpretq_u16_s16(q15s16), vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q15u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - } else { // skip_adding_dest - q0s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q1s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q12s16 = vaddq_s16(q0s16, q15s16); - q13s16 = vaddq_s16(q1s16, q14s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q14s16 = vsubq_s16(q1s16, q14s16); - q15s16 = vsubq_s16(q0s16, q15s16); - - q10s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q11s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q12s16 = 
vaddq_s16(q10s16, q5s16); - q13s16 = vaddq_s16(q11s16, q4s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q4s16 = vsubq_s16(q11s16, q4s16); - q5s16 = vsubq_s16(q10s16, q5s16); - - q0s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q1s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q12s16 = vaddq_s16(q0s16, q3s16); - q13s16 = vaddq_s16(q1s16, q2s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q2s16 = vsubq_s16(q1s16, q2s16); - q3s16 = vsubq_s16(q0s16, q3s16); - - q10s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q11s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q12s16 = vaddq_s16(q10s16, q9s16); - q13s16 = vaddq_s16(q11s16, q8s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q8s16 = vsubq_s16(q11s16, q8s16); - q9s16 = vsubq_s16(q10s16, q9s16); - - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q8s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q8s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q9s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q9s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q2s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q2s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q3s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q3s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q4s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q4s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q5s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q5s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q14s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q14s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q15s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q15s16))); + out[0] = vaddq_s16(step2[0], step2[15]); + out[1] = vaddq_s16(step2[1], step2[14]); + out[2] = vaddq_s16(step2[2], step2[13]); + out[3] = vaddq_s16(step2[3], step2[12]); + out[4] = vaddq_s16(step2[4], step2[11]); + out[5] = vaddq_s16(step2[5], step2[10]); + out[6] = vaddq_s16(step2[6], 
step2[9]); + out[7] = vaddq_s16(step2[7], step2[8]); + out[8] = vsubq_s16(step2[7], step2[8]); + out[9] = vsubq_s16(step2[6], step2[9]); + out[10] = vsubq_s16(step2[5], step2[10]); + out[11] = vsubq_s16(step2[4], step2[11]); + out[12] = vsubq_s16(step2[3], step2[12]); + out[13] = vsubq_s16(step2[2], step2[13]); + out[14] = vsubq_s16(step2[1], step2[14]); + out[15] = vsubq_s16(step2[0], step2[15]); + + if (output) { + // pass 1: save the result into output + vst1q_s16(output, out[0]); + output += 16; + vst1q_s16(output, out[1]); + output += 16; + vst1q_s16(output, out[2]); + output += 16; + vst1q_s16(output, out[3]); + output += 16; + vst1q_s16(output, out[4]); + output += 16; + vst1q_s16(output, out[5]); + output += 16; + vst1q_s16(output, out[6]); + output += 16; + vst1q_s16(output, out[7]); + output += 16; + vst1q_s16(output, out[8]); + output += 16; + vst1q_s16(output, out[9]); + output += 16; + vst1q_s16(output, out[10]); + output += 16; + vst1q_s16(output, out[11]); + output += 16; + vst1q_s16(output, out[12]); + output += 16; + vst1q_s16(output, out[13]); + output += 16; + vst1q_s16(output, out[14]); + output += 16; + vst1q_s16(output, out[15]); + } else { + // pass 2: add the result to dest. + idct16x16_add8x1(out[0], &dest, stride); + idct16x16_add8x1(out[1], &dest, stride); + idct16x16_add8x1(out[2], &dest, stride); + idct16x16_add8x1(out[3], &dest, stride); + idct16x16_add8x1(out[4], &dest, stride); + idct16x16_add8x1(out[5], &dest, stride); + idct16x16_add8x1(out[6], &dest, stride); + idct16x16_add8x1(out[7], &dest, stride); + idct16x16_add8x1(out[8], &dest, stride); + idct16x16_add8x1(out[9], &dest, stride); + idct16x16_add8x1(out[10], &dest, stride); + idct16x16_add8x1(out[11], &dest, stride); + idct16x16_add8x1(out[12], &dest, stride); + idct16x16_add8x1(out[13], &dest, stride); + idct16x16_add8x1(out[14], &dest, stride); + idct16x16_add8x1(out[15], &dest, stride); } } -void vpx_idct16x16_10_add_neon_pass1(int16_t *in, int16_t *out, - int output_stride) { - int16x4_t d4s16; - int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; - uint64x1_t d4u64, d5u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64; - uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64; - int16x8_t q0s16, q1s16, q2s16, q4s16, q5s16, q6s16, q7s16; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - int32x4_t q6s32, q9s32; - int32x4_t q10s32, q11s32, q12s32, q15s32; - int16x8x2_t q0x2s16; - - q0x2s16 = vld2q_s16(in); - q8s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q9s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q10s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q11s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q12s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q13s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q14s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q15s16 = q0x2s16.val[0]; - - transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); +static void idct16x16_10_add_half1d_pass1(const tran_low_t *input, + int16_t *output) { + const int16x8_t cospis0 = vld1q_s16(kCospi); + const int16x8_t cospis1 = vld1q_s16(kCospi + 8); + const int16x8_t cospisd0 = vaddq_s16(cospis0, cospis0); + const int16x8_t cospisd1 = vaddq_s16(cospis1, cospis1); + const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0); + const int16x4_t cospid_0_8_16_24 = vget_low_s16(cospisd0); + const int16x4_t cospid_4_12_20N_28 = 
vget_high_s16(cospisd0); + const int16x4_t cospid_2_30_10_22 = vget_low_s16(cospisd1); + const int16x4_t cospid_6_26_14_18N = vget_high_s16(cospisd1); + int16x4_t in[4], step1[16], step2[16], out[16]; + +// Load input (4x4) +#if CONFIG_VP9_HIGHBITDEPTH + in[0] = load_tran_low_to_s16d(input); + input += 16; + in[1] = load_tran_low_to_s16d(input); + input += 16; + in[2] = load_tran_low_to_s16d(input); + input += 16; + in[3] = load_tran_low_to_s16d(input); +#else + in[0] = vld1_s16(input); + input += 16; + in[1] = vld1_s16(input); + input += 16; + in[2] = vld1_s16(input); + input += 16; + in[3] = vld1_s16(input); +#endif // CONFIG_VP9_HIGHBITDEPTH + + // Transpose + transpose_s16_4x4d(&in[0], &in[1], &in[2], &in[3]); + + // stage 1 + step1[0] = in[0 / 2]; + step1[4] = in[4 / 2]; + step1[8] = in[2 / 2]; + step1[12] = in[6 / 2]; + + // stage 2 + step2[0] = step1[0]; + step2[4] = step1[4]; + step2[8] = vqrdmulh_lane_s16(step1[8], cospid_2_30_10_22, 1); + step2[11] = vqrdmulh_lane_s16(step1[12], cospid_6_26_14_18N, 1); + step2[12] = vqrdmulh_lane_s16(step1[12], cospid_6_26_14_18N, 0); + step2[15] = vqrdmulh_lane_s16(step1[8], cospid_2_30_10_22, 0); // stage 3 - q0s16 = vdupq_n_s16((int16_t)cospi_28_64 * 2); - q1s16 = vdupq_n_s16((int16_t)cospi_4_64 * 2); - - q4s16 = vqrdmulhq_s16(q9s16, q0s16); - q7s16 = vqrdmulhq_s16(q9s16, q1s16); + step1[0] = step2[0]; + step1[4] = vqrdmulh_lane_s16(step2[4], cospid_4_12_20N_28, 3); + step1[7] = vqrdmulh_lane_s16(step2[4], cospid_4_12_20N_28, 0); + step1[8] = step2[8]; + step1[9] = step2[8]; + step1[10] = step2[11]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[13] = step2[12]; + step1[14] = step2[15]; + step1[15] = step2[15]; // stage 4 - q1s16 = vdupq_n_s16((int16_t)cospi_16_64 * 2); - d4s16 = vdup_n_s16((int16_t)cospi_16_64); - - q8s16 = vqrdmulhq_s16(q8s16, q1s16); - - d8s16 = vget_low_s16(q4s16); - d9s16 = vget_high_s16(q4s16); - d14s16 = vget_low_s16(q7s16); - d15s16 = vget_high_s16(q7s16); - q9s32 = vmull_s16(d14s16, d4s16); - q10s32 = vmull_s16(d15s16, d4s16); - q12s32 = vmull_s16(d9s16, d4s16); - q11s32 = vmull_s16(d8s16, d4s16); - - q15s32 = vsubq_s32(q10s32, q12s32); - q6s32 = vsubq_s32(q9s32, q11s32); - q9s32 = vaddq_s32(q9s32, q11s32); - q10s32 = vaddq_s32(q10s32, q12s32); - - d11s16 = vqrshrn_n_s32(q15s32, 14); - d10s16 = vqrshrn_n_s32(q6s32, 14); - d12s16 = vqrshrn_n_s32(q9s32, 14); - d13s16 = vqrshrn_n_s32(q10s32, 14); - q5s16 = vcombine_s16(d10s16, d11s16); - q6s16 = vcombine_s16(d12s16, d13s16); + step2[0] = step2[1] = vqrdmulh_lane_s16(step1[0], cospid_0_8_16_24, 2); + step2[4] = step1[4]; + step2[5] = step1[4]; + step2[6] = step1[7]; + step2[7] = step1[7]; + step2[8] = step1[8]; + idct_cospi_8_24_d(step1[14], step1[9], cospi_0_8_16_24, &step2[9], + &step2[14]); + idct_cospi_8_24_neg_d(step1[13], step1[10], cospi_0_8_16_24, &step2[13], + &step2[10]); + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + // stage 5 + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[1]; + step1[3] = step2[0]; + step1[4] = step2[4]; + idct_cospi_16_16_d(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]); + step1[7] = step2[7]; + step1[8] = vadd_s16(step2[8], step2[11]); + step1[9] = vadd_s16(step2[9], step2[10]); + step1[10] = vsub_s16(step2[9], step2[10]); + step1[11] = vsub_s16(step2[8], step2[11]); + step1[12] = vsub_s16(step2[15], step2[12]); + step1[13] = vsub_s16(step2[14], step2[13]); + step1[14] = vadd_s16(step2[14], step2[13]); + step1[15] = vadd_s16(step2[15], step2[12]); // stage 6 - q2s16 = 
vaddq_s16(q8s16, q7s16); - q9s16 = vaddq_s16(q8s16, q6s16); - q10s16 = vaddq_s16(q8s16, q5s16); - q11s16 = vaddq_s16(q8s16, q4s16); - q12s16 = vsubq_s16(q8s16, q4s16); - q13s16 = vsubq_s16(q8s16, q5s16); - q14s16 = vsubq_s16(q8s16, q6s16); - q15s16 = vsubq_s16(q8s16, q7s16); - - d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16)); - d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16)); - d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16)); - d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16)); - d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16)); - d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16)); - d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16)); - d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16)); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16)); - d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16)); - d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16)); - d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16)); - - // store the data - output_stride >>= 1; // output_stride / 2, out is int16_t - vst1_u64((uint64_t *)out, d4u64); - out += output_stride; - vst1_u64((uint64_t *)out, d5u64); - out += output_stride; - vst1_u64((uint64_t *)out, d18u64); - out += output_stride; - vst1_u64((uint64_t *)out, d19u64); - out += output_stride; - vst1_u64((uint64_t *)out, d20u64); - out += output_stride; - vst1_u64((uint64_t *)out, d21u64); - out += output_stride; - vst1_u64((uint64_t *)out, d22u64); - out += output_stride; - vst1_u64((uint64_t *)out, d23u64); - out += output_stride; - vst1_u64((uint64_t *)out, d24u64); - out += output_stride; - vst1_u64((uint64_t *)out, d25u64); - out += output_stride; - vst1_u64((uint64_t *)out, d26u64); - out += output_stride; - vst1_u64((uint64_t *)out, d27u64); - out += output_stride; - vst1_u64((uint64_t *)out, d28u64); - out += output_stride; - vst1_u64((uint64_t *)out, d29u64); - out += output_stride; - vst1_u64((uint64_t *)out, d30u64); - out += output_stride; - vst1_u64((uint64_t *)out, d31u64); + step2[0] = vadd_s16(step1[0], step1[7]); + step2[1] = vadd_s16(step1[1], step1[6]); + step2[2] = vadd_s16(step1[2], step1[5]); + step2[3] = vadd_s16(step1[3], step1[4]); + step2[4] = vsub_s16(step1[3], step1[4]); + step2[5] = vsub_s16(step1[2], step1[5]); + step2[6] = vsub_s16(step1[1], step1[6]); + step2[7] = vsub_s16(step1[0], step1[7]); + idct_cospi_16_16_d(step1[10], step1[13], cospi_0_8_16_24, &step2[10], + &step2[13]); + idct_cospi_16_16_d(step1[11], step1[12], cospi_0_8_16_24, &step2[11], + &step2[12]); + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; + + // stage 7 + out[0] = vadd_s16(step2[0], step2[15]); + out[1] = vadd_s16(step2[1], step2[14]); + out[2] = vadd_s16(step2[2], step2[13]); + out[3] = vadd_s16(step2[3], step2[12]); + out[4] = vadd_s16(step2[4], step2[11]); + out[5] = vadd_s16(step2[5], step2[10]); + out[6] = vadd_s16(step2[6], step2[9]); + out[7] = vadd_s16(step2[7], step2[8]); + out[8] = vsub_s16(step2[7], step2[8]); + out[9] = vsub_s16(step2[6], step2[9]); + out[10] = vsub_s16(step2[5], step2[10]); + out[11] = vsub_s16(step2[4], step2[11]); + out[12] = vsub_s16(step2[3], step2[12]); + out[13] = vsub_s16(step2[2], step2[13]); + out[14] = vsub_s16(step2[1], step2[14]); + out[15] = vsub_s16(step2[0], step2[15]); + + // pass 1: save the result into output + 
vst1_s16(output, out[0]); + output += 4; + vst1_s16(output, out[1]); + output += 4; + vst1_s16(output, out[2]); + output += 4; + vst1_s16(output, out[3]); + output += 4; + vst1_s16(output, out[4]); + output += 4; + vst1_s16(output, out[5]); + output += 4; + vst1_s16(output, out[6]); + output += 4; + vst1_s16(output, out[7]); + output += 4; + vst1_s16(output, out[8]); + output += 4; + vst1_s16(output, out[9]); + output += 4; + vst1_s16(output, out[10]); + output += 4; + vst1_s16(output, out[11]); + output += 4; + vst1_s16(output, out[12]); + output += 4; + vst1_s16(output, out[13]); + output += 4; + vst1_s16(output, out[14]); + output += 4; + vst1_s16(output, out[15]); } -void vpx_idct16x16_10_add_neon_pass2(int16_t *src, int16_t *out, - int16_t *pass1Output, int16_t skip_adding, - uint8_t *dest, int dest_stride) { - int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; - int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; - int16x4_t d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d30s16, d31s16; - uint64x1_t d4u64, d5u64, d6u64, d7u64, d8u64, d9u64, d10u64, d11u64; - uint64x1_t d16u64, d17u64, d18u64, d19u64; - uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64; - int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32; - int32x4_t q10s32, q11s32, q12s32, q13s32; - int16x8x2_t q0x2s16; - (void)skip_adding; - (void)dest; - (void)dest_stride; - - q0x2s16 = vld2q_s16(src); - q8s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q9s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q10s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q11s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q12s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q13s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q14s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q15s16 = q0x2s16.val[0]; - - transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); +static void idct16x16_10_add_half1d_pass2(const int16_t *input, int16_t *output, + uint8_t *dest, int stride) { + const int16x8_t cospis0 = vld1q_s16(kCospi); + const int16x8_t cospis1 = vld1q_s16(kCospi + 8); + const int16x8_t cospisd0 = vaddq_s16(cospis0, cospis0); + const int16x8_t cospisd1 = vaddq_s16(cospis1, cospis1); + const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0); + const int16x4_t cospid_0_8_16_24 = vget_low_s16(cospisd0); + const int16x4_t cospid_4_12_20N_28 = vget_high_s16(cospisd0); + const int16x4_t cospid_2_30_10_22 = vget_low_s16(cospisd1); + const int16x4_t cospid_6_26_14_18N = vget_high_s16(cospisd1); + int16x4_t ind[8]; + int16x8_t in[4], step1[16], step2[16], out[16]; + + // Load input (4x8) + ind[0] = vld1_s16(input); + input += 4; + ind[1] = vld1_s16(input); + input += 4; + ind[2] = vld1_s16(input); + input += 4; + ind[3] = vld1_s16(input); + input += 4; + ind[4] = vld1_s16(input); + input += 4; + ind[5] = vld1_s16(input); + input += 4; + ind[6] = vld1_s16(input); + input += 4; + ind[7] = vld1_s16(input); + + // Transpose + transpose_s16_4x8(ind[0], ind[1], ind[2], ind[3], ind[4], ind[5], ind[6], + ind[7], &in[0], &in[1], &in[2], &in[3]); + + // stage 1 + step1[0] = in[0 / 2]; + step1[4] = in[4 / 2]; + step1[8] = in[2 / 2]; + step1[12] = in[6 / 2]; + + // stage 2 + 
step2[0] = step1[0]; + step2[4] = step1[4]; + step2[8] = vqrdmulhq_lane_s16(step1[8], cospid_2_30_10_22, 1); + step2[11] = vqrdmulhq_lane_s16(step1[12], cospid_6_26_14_18N, 1); + step2[12] = vqrdmulhq_lane_s16(step1[12], cospid_6_26_14_18N, 0); + step2[15] = vqrdmulhq_lane_s16(step1[8], cospid_2_30_10_22, 0); // stage 3 - q6s16 = vdupq_n_s16((int16_t)cospi_30_64 * 2); - q0s16 = vqrdmulhq_s16(q8s16, q6s16); - q6s16 = vdupq_n_s16((int16_t)cospi_2_64 * 2); - q7s16 = vqrdmulhq_s16(q8s16, q6s16); - - q15s16 = vdupq_n_s16((int16_t)-cospi_26_64 * 2); - q14s16 = vdupq_n_s16((int16_t)cospi_6_64 * 2); - q3s16 = vqrdmulhq_s16(q9s16, q15s16); - q4s16 = vqrdmulhq_s16(q9s16, q14s16); + step1[0] = step2[0]; + step1[4] = vqrdmulhq_lane_s16(step2[4], cospid_4_12_20N_28, 3); + step1[7] = vqrdmulhq_lane_s16(step2[4], cospid_4_12_20N_28, 0); + step1[8] = step2[8]; + step1[9] = step2[8]; + step1[10] = step2[11]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[13] = step2[12]; + step1[14] = step2[15]; + step1[15] = step2[15]; // stage 4 - d0s16 = vget_low_s16(q0s16); - d1s16 = vget_high_s16(q0s16); - d6s16 = vget_low_s16(q3s16); - d7s16 = vget_high_s16(q3s16); - d8s16 = vget_low_s16(q4s16); - d9s16 = vget_high_s16(q4s16); - d14s16 = vget_low_s16(q7s16); - d15s16 = vget_high_s16(q7s16); - - d30s16 = vdup_n_s16((int16_t)cospi_8_64); - d31s16 = vdup_n_s16((int16_t)cospi_24_64); - - q12s32 = vmull_s16(d14s16, d31s16); - q5s32 = vmull_s16(d15s16, d31s16); - q2s32 = vmull_s16(d0s16, d31s16); - q11s32 = vmull_s16(d1s16, d31s16); - - q12s32 = vmlsl_s16(q12s32, d0s16, d30s16); - q5s32 = vmlsl_s16(q5s32, d1s16, d30s16); - q2s32 = vmlal_s16(q2s32, d14s16, d30s16); - q11s32 = vmlal_s16(q11s32, d15s16, d30s16); - - d2s16 = vqrshrn_n_s32(q12s32, 14); - d3s16 = vqrshrn_n_s32(q5s32, 14); - d12s16 = vqrshrn_n_s32(q2s32, 14); - d13s16 = vqrshrn_n_s32(q11s32, 14); - q1s16 = vcombine_s16(d2s16, d3s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - d30s16 = vdup_n_s16(-cospi_8_64); - q10s32 = vmull_s16(d8s16, d30s16); - q13s32 = vmull_s16(d9s16, d30s16); - q8s32 = vmull_s16(d6s16, d30s16); - q9s32 = vmull_s16(d7s16, d30s16); - - q10s32 = vmlsl_s16(q10s32, d6s16, d31s16); - q13s32 = vmlsl_s16(q13s32, d7s16, d31s16); - q8s32 = vmlal_s16(q8s32, d8s16, d31s16); - q9s32 = vmlal_s16(q9s32, d9s16, d31s16); - - d4s16 = vqrshrn_n_s32(q10s32, 14); - d5s16 = vqrshrn_n_s32(q13s32, 14); - d10s16 = vqrshrn_n_s32(q8s32, 14); - d11s16 = vqrshrn_n_s32(q9s32, 14); - q2s16 = vcombine_s16(d4s16, d5s16); - q5s16 = vcombine_s16(d10s16, d11s16); + step2[0] = step2[1] = vqrdmulhq_lane_s16(step1[0], cospid_0_8_16_24, 2); + step2[4] = step1[4]; + step2[5] = step1[4]; + step2[6] = step1[7]; + step2[7] = step1[7]; + step2[8] = step1[8]; + idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9], + &step2[14]); + idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24, &step2[13], + &step2[10]); + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; // stage 5 - q8s16 = vaddq_s16(q0s16, q3s16); - q9s16 = vaddq_s16(q1s16, q2s16); - q10s16 = vsubq_s16(q1s16, q2s16); - q11s16 = vsubq_s16(q0s16, q3s16); - q12s16 = vsubq_s16(q7s16, q4s16); - q13s16 = vsubq_s16(q6s16, q5s16); - q14s16 = vaddq_s16(q6s16, q5s16); - q15s16 = vaddq_s16(q7s16, q4s16); + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[1]; + step1[3] = step2[0]; + step1[4] = step2[4]; + idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]); + step1[7] = step2[7]; + step1[8] = vaddq_s16(step2[8], step2[11]); + step1[9] = 
vaddq_s16(step2[9], step2[10]); + step1[10] = vsubq_s16(step2[9], step2[10]); + step1[11] = vsubq_s16(step2[8], step2[11]); + step1[12] = vsubq_s16(step2[15], step2[12]); + step1[13] = vsubq_s16(step2[14], step2[13]); + step1[14] = vaddq_s16(step2[14], step2[13]); + step1[15] = vaddq_s16(step2[15], step2[12]); // stage 6 - d20s16 = vget_low_s16(q10s16); - d21s16 = vget_high_s16(q10s16); - d22s16 = vget_low_s16(q11s16); - d23s16 = vget_high_s16(q11s16); - d24s16 = vget_low_s16(q12s16); - d25s16 = vget_high_s16(q12s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - - d14s16 = vdup_n_s16((int16_t)cospi_16_64); - q3s32 = vmull_s16(d26s16, d14s16); - q4s32 = vmull_s16(d27s16, d14s16); - q0s32 = vmull_s16(d20s16, d14s16); - q1s32 = vmull_s16(d21s16, d14s16); - - q5s32 = vsubq_s32(q3s32, q0s32); - q6s32 = vsubq_s32(q4s32, q1s32); - q0s32 = vaddq_s32(q3s32, q0s32); - q4s32 = vaddq_s32(q4s32, q1s32); - - d4s16 = vqrshrn_n_s32(q5s32, 14); - d5s16 = vqrshrn_n_s32(q6s32, 14); - d10s16 = vqrshrn_n_s32(q0s32, 14); - d11s16 = vqrshrn_n_s32(q4s32, 14); - q2s16 = vcombine_s16(d4s16, d5s16); - q5s16 = vcombine_s16(d10s16, d11s16); - - q0s32 = vmull_s16(d22s16, d14s16); - q1s32 = vmull_s16(d23s16, d14s16); - q13s32 = vmull_s16(d24s16, d14s16); - q6s32 = vmull_s16(d25s16, d14s16); - - q10s32 = vsubq_s32(q13s32, q0s32); - q4s32 = vsubq_s32(q6s32, q1s32); - q13s32 = vaddq_s32(q13s32, q0s32); - q6s32 = vaddq_s32(q6s32, q1s32); - - d6s16 = vqrshrn_n_s32(q10s32, 14); - d7s16 = vqrshrn_n_s32(q4s32, 14); - d8s16 = vqrshrn_n_s32(q13s32, 14); - d9s16 = vqrshrn_n_s32(q6s32, 14); - q3s16 = vcombine_s16(d6s16, d7s16); - q4s16 = vcombine_s16(d8s16, d9s16); + step2[0] = vaddq_s16(step1[0], step1[7]); + step2[1] = vaddq_s16(step1[1], step1[6]); + step2[2] = vaddq_s16(step1[2], step1[5]); + step2[3] = vaddq_s16(step1[3], step1[4]); + step2[4] = vsubq_s16(step1[3], step1[4]); + step2[5] = vsubq_s16(step1[2], step1[5]); + step2[6] = vsubq_s16(step1[1], step1[6]); + step2[7] = vsubq_s16(step1[0], step1[7]); + idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10], + &step2[13]); + idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11], + &step2[12]); + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; // stage 7 - q0s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q1s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q12s16 = vaddq_s16(q0s16, q15s16); - q13s16 = vaddq_s16(q1s16, q14s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q14s16 = vsubq_s16(q1s16, q14s16); - q15s16 = vsubq_s16(q0s16, q15s16); - - q10s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q11s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q12s16 = vaddq_s16(q10s16, q5s16); - q13s16 = vaddq_s16(q11s16, q4s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - 
vst1_u64((uint64_t *)out, d27u64); - out += 12; - q4s16 = vsubq_s16(q11s16, q4s16); - q5s16 = vsubq_s16(q10s16, q5s16); - - q0s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q1s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q12s16 = vaddq_s16(q0s16, q3s16); - q13s16 = vaddq_s16(q1s16, q2s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q2s16 = vsubq_s16(q1s16, q2s16); - q3s16 = vsubq_s16(q0s16, q3s16); - - q10s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q11s16 = vld1q_s16(pass1Output); - q12s16 = vaddq_s16(q10s16, q9s16); - q13s16 = vaddq_s16(q11s16, q8s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q8s16 = vsubq_s16(q11s16, q8s16); - q9s16 = vsubq_s16(q10s16, q9s16); - - d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16)); - d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16)); - d6u64 = vreinterpret_u64_s16(vget_low_s16(q3s16)); - d7u64 = vreinterpret_u64_s16(vget_high_s16(q3s16)); - d8u64 = vreinterpret_u64_s16(vget_low_s16(q4s16)); - d9u64 = vreinterpret_u64_s16(vget_high_s16(q4s16)); - d10u64 = vreinterpret_u64_s16(vget_low_s16(q5s16)); - d11u64 = vreinterpret_u64_s16(vget_high_s16(q5s16)); - d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16)); - d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16)); - d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16)); - d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16)); - d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16)); - d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16)); - d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16)); - d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16)); - - vst1_u64((uint64_t *)out, d16u64); - out += 4; - vst1_u64((uint64_t *)out, d17u64); - out += 12; - vst1_u64((uint64_t *)out, d18u64); - out += 4; - vst1_u64((uint64_t *)out, d19u64); - out += 12; - vst1_u64((uint64_t *)out, d4u64); - out += 4; - vst1_u64((uint64_t *)out, d5u64); - out += 12; - vst1_u64((uint64_t *)out, d6u64); - out += 4; - vst1_u64((uint64_t *)out, d7u64); - out += 12; - vst1_u64((uint64_t *)out, d8u64); - out += 4; - vst1_u64((uint64_t *)out, d9u64); - out += 12; - vst1_u64((uint64_t *)out, d10u64); - out += 4; - vst1_u64((uint64_t *)out, d11u64); - out += 12; - vst1_u64((uint64_t *)out, d28u64); - out += 4; - vst1_u64((uint64_t *)out, d29u64); - out += 12; - vst1_u64((uint64_t *)out, d30u64); - out += 4; - vst1_u64((uint64_t *)out, d31u64); + out[0] = vaddq_s16(step2[0], step2[15]); + out[1] = vaddq_s16(step2[1], step2[14]); + out[2] = vaddq_s16(step2[2], step2[13]); + out[3] = vaddq_s16(step2[3], step2[12]); + out[4] = vaddq_s16(step2[4], step2[11]); + out[5] = vaddq_s16(step2[5], step2[10]); + out[6] = vaddq_s16(step2[6], step2[9]); + out[7] = vaddq_s16(step2[7], step2[8]); + out[8] = vsubq_s16(step2[7], step2[8]); + out[9] = vsubq_s16(step2[6], step2[9]); + out[10] = vsubq_s16(step2[5], 
step2[10]); + out[11] = vsubq_s16(step2[4], step2[11]); + out[12] = vsubq_s16(step2[3], step2[12]); + out[13] = vsubq_s16(step2[2], step2[13]); + out[14] = vsubq_s16(step2[1], step2[14]); + out[15] = vsubq_s16(step2[0], step2[15]); + + if (output) { + // pass 1: save the result into output + vst1q_s16(output, out[0]); + output += 16; + vst1q_s16(output, out[1]); + output += 16; + vst1q_s16(output, out[2]); + output += 16; + vst1q_s16(output, out[3]); + output += 16; + vst1q_s16(output, out[4]); + output += 16; + vst1q_s16(output, out[5]); + output += 16; + vst1q_s16(output, out[6]); + output += 16; + vst1q_s16(output, out[7]); + output += 16; + vst1q_s16(output, out[8]); + output += 16; + vst1q_s16(output, out[9]); + output += 16; + vst1q_s16(output, out[10]); + output += 16; + vst1q_s16(output, out[11]); + output += 16; + vst1q_s16(output, out[12]); + output += 16; + vst1q_s16(output, out[13]); + output += 16; + vst1q_s16(output, out[14]); + output += 16; + vst1q_s16(output, out[15]); + } else { + // pass 2: add the result to dest. + idct16x16_add8x1(out[0], &dest, stride); + idct16x16_add8x1(out[1], &dest, stride); + idct16x16_add8x1(out[2], &dest, stride); + idct16x16_add8x1(out[3], &dest, stride); + idct16x16_add8x1(out[4], &dest, stride); + idct16x16_add8x1(out[5], &dest, stride); + idct16x16_add8x1(out[6], &dest, stride); + idct16x16_add8x1(out[7], &dest, stride); + idct16x16_add8x1(out[8], &dest, stride); + idct16x16_add8x1(out[9], &dest, stride); + idct16x16_add8x1(out[10], &dest, stride); + idct16x16_add8x1(out[11], &dest, stride); + idct16x16_add8x1(out[12], &dest, stride); + idct16x16_add8x1(out[13], &dest, stride); + idct16x16_add8x1(out[14], &dest, stride); + idct16x16_add8x1(out[15], &dest, stride); + } +} + +void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, + int stride) { + int16_t row_idct_output[16 * 16]; + +#if CONFIG_VP9_HIGHBITDEPTH + int16_t pass1_input[16 * 16]; + idct16x16_256_add_load_tran_low(input, pass1_input); +#else + const int16_t *pass1_input = input; +#endif // CONFIG_VP9_HIGHBITDEPTH + + // pass 1 + // Parallel idct on the upper 8 rows + idct16x16_256_add_half1d(pass1_input, row_idct_output, dest, stride); + + // Parallel idct on the lower 8 rows + idct16x16_256_add_half1d(pass1_input + 8 * 16, row_idct_output + 8, dest, + stride); + + // pass 2 + // Parallel idct to get the left 8 columns + idct16x16_256_add_half1d(row_idct_output, NULL, dest, stride); + + // Parallel idct to get the right 8 columns + idct16x16_256_add_half1d(row_idct_output + 16 * 8, NULL, dest + 8, stride); +} + +void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest, + int stride) { + int16_t row_idct_output[4 * 16]; + + // pass 1 + // Parallel idct on the upper 8 rows + idct16x16_10_add_half1d_pass1(input, row_idct_output); + + // pass 2 + // Parallel idct to get the left 8 columns + idct16x16_10_add_half1d_pass2(row_idct_output, NULL, dest, stride); + + // Parallel idct to get the right 8 columns + idct16x16_10_add_half1d_pass2(row_idct_output + 4 * 8, NULL, dest + 8, + stride); } diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_neon.c index bdbbf519332..47366bcb7d6 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_neon.c @@ -11,16 +11,29 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/vpx_dsp_common.h" -void 
vpx_idct16x16_256_add_neon_pass1(const int16_t *input, int16_t *output, - int output_stride); +void vpx_idct16x16_256_add_neon_pass1(const int16_t *input, int16_t *output); void vpx_idct16x16_256_add_neon_pass2(const int16_t *src, int16_t *output, - int16_t *pass1Output, int16_t skip_adding, - uint8_t *dest, int dest_stride); -void vpx_idct16x16_10_add_neon_pass1(const int16_t *input, int16_t *output, - int output_stride); -void vpx_idct16x16_10_add_neon_pass2(const int16_t *src, int16_t *output, - int16_t *pass1Output, int16_t skip_adding, - uint8_t *dest, int dest_stride); + int16_t *pass1_output, + int16_t skip_adding, uint8_t *dest, + int stride); +#if CONFIG_VP9_HIGHBITDEPTH +void vpx_idct16x16_256_add_neon_pass1_tran_low(const tran_low_t *input, + int16_t *output); +void vpx_idct16x16_256_add_neon_pass2_tran_low(const tran_low_t *src, + int16_t *output, + int16_t *pass1_output, + int16_t skip_adding, + uint8_t *dest, int stride); +#else +#define vpx_idct16x16_256_add_neon_pass1_tran_low \ + vpx_idct16x16_256_add_neon_pass1 +#define vpx_idct16x16_256_add_neon_pass2_tran_low \ + vpx_idct16x16_256_add_neon_pass2 +#endif + +void vpx_idct16x16_10_add_neon_pass1(const tran_low_t *input, int16_t *output); +void vpx_idct16x16_10_add_neon_pass2(const tran_low_t *src, int16_t *output, + int16_t *pass1_output); #if HAVE_NEON_ASM /* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */ @@ -28,8 +41,8 @@ extern void vpx_push_neon(int64_t *store); extern void vpx_pop_neon(int64_t *store); #endif // HAVE_NEON_ASM -void vpx_idct16x16_256_add_neon(const int16_t *input, uint8_t *dest, - int dest_stride) { +void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, + int stride) { #if HAVE_NEON_ASM int64_t store_reg[8]; #endif @@ -44,47 +57,47 @@ void vpx_idct16x16_256_add_neon(const int16_t *input, uint8_t *dest, /* Parallel idct on the upper 8 rows */ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the // stage 6 result in pass1_output. - vpx_idct16x16_256_add_neon_pass1(input, pass1_output, 8); + vpx_idct16x16_256_add_neon_pass1_tran_low(input, pass1_output); // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines // with result in pass1(pass1_output) to calculate final result in stage 7 // which will be saved into row_idct_output. - vpx_idct16x16_256_add_neon_pass2(input + 1, row_idct_output, pass1_output, 0, - dest, dest_stride); + vpx_idct16x16_256_add_neon_pass2_tran_low(input + 1, row_idct_output, + pass1_output, 0, dest, stride); /* Parallel idct on the lower 8 rows */ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the // stage 6 result in pass1_output. - vpx_idct16x16_256_add_neon_pass1(input + 8 * 16, pass1_output, 8); + vpx_idct16x16_256_add_neon_pass1_tran_low(input + 8 * 16, pass1_output); // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines // with result in pass1(pass1_output) to calculate final result in stage 7 // which will be saved into row_idct_output. - vpx_idct16x16_256_add_neon_pass2(input + 8 * 16 + 1, row_idct_output + 8, - pass1_output, 0, dest, dest_stride); + vpx_idct16x16_256_add_neon_pass2_tran_low( + input + 8 * 16 + 1, row_idct_output + 8, pass1_output, 0, dest, stride); /* Parallel idct on the left 8 columns */ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the // stage 6 result in pass1_output. 
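Editorial note, not part of the libvpx change: the old two-pass wrappers above split each row pass into even and odd coefficients, with pass 2 called on input + 1. A minimal sketch of how the removed pass-2 code selects the odd-index coefficients, based on the vld2q_s16 / .val[0] pattern visible in the deleted code; the helper name is hypothetical.

/* Editorial sketch: with row_plus_one = row + 1, vld2q_s16 de-interleaves
 * adjacent pairs, so .val[0] holds elements 1, 3, 5, ..., 15 of the row. */
static INLINE int16x8_t load_odd_coeffs_sketch(const int16_t *row_plus_one) {
  const int16x8x2_t pair = vld2q_s16(row_plus_one);
  return pair.val[0];
}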
- vpx_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8); + vpx_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output); // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines // with result in pass1(pass1_output) to calculate final result in stage 7. // Then add the result to the destination data. vpx_idct16x16_256_add_neon_pass2(row_idct_output + 1, row_idct_output, - pass1_output, 1, dest, dest_stride); + pass1_output, 1, dest, stride); /* Parallel idct on the right 8 columns */ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the // stage 6 result in pass1_output. - vpx_idct16x16_256_add_neon_pass1(row_idct_output + 8 * 16, pass1_output, 8); + vpx_idct16x16_256_add_neon_pass1(row_idct_output + 8 * 16, pass1_output); // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines // with result in pass1(pass1_output) to calculate final result in stage 7. // Then add the result to the destination data. vpx_idct16x16_256_add_neon_pass2(row_idct_output + 8 * 16 + 1, row_idct_output + 8, pass1_output, 1, - dest + 8, dest_stride); + dest + 8, stride); #if HAVE_NEON_ASM // restore d8-d15 register values. @@ -92,8 +105,8 @@ void vpx_idct16x16_256_add_neon(const int16_t *input, uint8_t *dest, #endif } -void vpx_idct16x16_10_add_neon(const int16_t *input, uint8_t *dest, - int dest_stride) { +void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest, + int stride) { #if HAVE_NEON_ASM int64_t store_reg[8]; #endif @@ -108,38 +121,37 @@ void vpx_idct16x16_10_add_neon(const int16_t *input, uint8_t *dest, /* Parallel idct on the upper 8 rows */ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the // stage 6 result in pass1_output. - vpx_idct16x16_10_add_neon_pass1(input, pass1_output, 8); + vpx_idct16x16_10_add_neon_pass1(input, pass1_output); // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines // with result in pass1(pass1_output) to calculate final result in stage 7 // which will be saved into row_idct_output. - vpx_idct16x16_10_add_neon_pass2(input + 1, row_idct_output, pass1_output, 0, - dest, dest_stride); + vpx_idct16x16_10_add_neon_pass2(input + 1, row_idct_output, pass1_output); /* Skip Parallel idct on the lower 8 rows as they are all 0s */ /* Parallel idct on the left 8 columns */ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the // stage 6 result in pass1_output. - vpx_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8); + vpx_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output); // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines // with result in pass1(pass1_output) to calculate final result in stage 7. // Then add the result to the destination data. vpx_idct16x16_256_add_neon_pass2(row_idct_output + 1, row_idct_output, - pass1_output, 1, dest, dest_stride); + pass1_output, 1, dest, stride); /* Parallel idct on the right 8 columns */ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the // stage 6 result in pass1_output. - vpx_idct16x16_256_add_neon_pass1(row_idct_output + 8 * 16, pass1_output, 8); + vpx_idct16x16_256_add_neon_pass1(row_idct_output + 8 * 16, pass1_output); // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines // with result in pass1(pass1_output) to calculate final result in stage 7. // Then add the result to the destination data. 
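Editorial note, not part of the libvpx change: in the column pass the residual is rounded by 2^6 and added to the destination pixels. The new code delegates this to idct16x16_add8x1() (defined in idct_neon.h, not shown in this diff); a minimal sketch of what that step presumably does, mirroring the vrshrq_n_s16 / vaddw_u8 / vqmovun_s16 sequence in the removed code. The function name below is hypothetical.

static INLINE void add8x1_sketch(const int16x8_t res, uint8_t **dest,
                                 const int stride) {
  /* Round the 16-bit residual by 2^6, widen-add one 8-pixel row of dest,
   * saturate back to 8 bits, store, and advance to the next row. */
  const uint8x8_t d = vld1_u8(*dest);
  const int16x8_t r = vrshrq_n_s16(res, 6);
  const uint16x8_t sum = vaddw_u8(vreinterpretq_u16_s16(r), d);
  vst1_u8(*dest, vqmovun_s16(vreinterpretq_s16_u16(sum)));
  *dest += stride;
}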
vpx_idct16x16_256_add_neon_pass2(row_idct_output + 8 * 16 + 1, row_idct_output + 8, pass1_output, 1, - dest + 8, dest_stride); + dest + 8, stride); #if HAVE_NEON_ASM // restore d8-d15 register values. diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c new file mode 100644 index 00000000000..28b94655848 --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c @@ -0,0 +1,714 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/txfm_common.h" + +static INLINE void load_8x8_s16(const tran_low_t *input, int16x8_t *const in0, + int16x8_t *const in1, int16x8_t *const in2, + int16x8_t *const in3, int16x8_t *const in4, + int16x8_t *const in5, int16x8_t *const in6, + int16x8_t *const in7) { + *in0 = load_tran_low_to_s16q(input); + input += 32; + *in1 = load_tran_low_to_s16q(input); + input += 32; + *in2 = load_tran_low_to_s16q(input); + input += 32; + *in3 = load_tran_low_to_s16q(input); + input += 32; + *in4 = load_tran_low_to_s16q(input); + input += 32; + *in5 = load_tran_low_to_s16q(input); + input += 32; + *in6 = load_tran_low_to_s16q(input); + input += 32; + *in7 = load_tran_low_to_s16q(input); +} + +static INLINE void load_4x8_s16(const tran_low_t *input, int16x4_t *const in0, + int16x4_t *const in1, int16x4_t *const in2, + int16x4_t *const in3, int16x4_t *const in4, + int16x4_t *const in5, int16x4_t *const in6, + int16x4_t *const in7) { + *in0 = load_tran_low_to_s16d(input); + input += 32; + *in1 = load_tran_low_to_s16d(input); + input += 32; + *in2 = load_tran_low_to_s16d(input); + input += 32; + *in3 = load_tran_low_to_s16d(input); + input += 32; + *in4 = load_tran_low_to_s16d(input); + input += 32; + *in5 = load_tran_low_to_s16d(input); + input += 32; + *in6 = load_tran_low_to_s16d(input); + input += 32; + *in7 = load_tran_low_to_s16d(input); +} + +// Only for the first pass of the _135_ variant. Since it only uses values from +// the top left 16x16 it can safely assume all the remaining values are 0 and +// skip an awful lot of calculations. In fact, only the first 12 columns make +// the cut. None of the elements in the 13th, 14th, 15th or 16th columns are +// used so it skips any calls to input[12|13|14|15] too. +// In C this does a single row of 32 for each call. Here it transposes the top +// left 12x8 to allow using SIMD. 
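Editorial note, not part of the libvpx change: the new 32x32 code below (like the 16x16 half1d code above) relies on helpers such as multiply_shift_and_narrow_s16 from idct_neon.h, which this diff does not show. A plausible sketch, assuming it follows the vmull_s16 + vqrshrn_n_s32(..., 14) idiom in the removed code, where 14 is DCT_CONST_BITS; the body below is an assumption, not the library's definition.

/* Editorial sketch, assumed implementation: widen to 32 bits, multiply by the
 * cosine constant, then round-shift by DCT_CONST_BITS (14) back to 16 bits. */
static INLINE int16x8_t multiply_shift_and_narrow_sketch(const int16x8_t a,
                                                         const int16_t c) {
  const int32x4_t lo = vmull_n_s16(vget_low_s16(a), c);
  const int32x4_t hi = vmull_n_s16(vget_high_s16(a), c);
  return vcombine_s16(vqrshrn_n_s32(lo, 14), vqrshrn_n_s32(hi, 14));
}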
+ +// vp9/common/vp9_scan.c:vp9_default_iscan_32x32 arranges the first 135 non-zero +// coefficients as follows: +// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 +// 0 0 2 5 10 17 25 38 47 62 83 101 121 +// 1 1 4 8 15 22 30 45 58 74 92 112 133 +// 2 3 7 12 18 28 36 52 64 82 102 118 +// 3 6 11 16 23 31 43 60 73 90 109 126 +// 4 9 14 19 29 37 50 65 78 98 116 134 +// 5 13 20 26 35 44 54 72 85 105 123 +// 6 21 27 33 42 53 63 80 94 113 132 +// 7 24 32 39 48 57 71 88 104 120 +// 8 34 40 46 56 68 81 96 111 130 +// 9 41 49 55 67 77 91 107 124 +// 10 51 59 66 76 89 99 119 131 +// 11 61 69 75 87 100 114 129 +// 12 70 79 86 97 108 122 +// 13 84 93 103 110 125 +// 14 98 106 115 127 +// 15 117 128 +static void idct32_12_neon(const tran_low_t *input, int16_t *output) { + int16x8_t in0, in1, in2, in3, in4, in5, in6, in7; + int16x4_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + int16x8_t in8, in9, in10, in11; + int16x8_t s1_16, s1_18, s1_19, s1_20, s1_21, s1_23, s1_24, s1_26, s1_27, + s1_28, s1_29, s1_31; + int16x8_t s2_8, s2_10, s2_11, s2_12, s2_13, s2_15, s2_18, s2_19, s2_20, s2_21, + s2_26, s2_27, s2_28, s2_29; + int16x8_t s3_4, s3_7, s3_10, s3_11, s3_12, s3_13, s3_17, s3_18, s3_21, s3_22, + s3_25, s3_26, s3_29, s3_30; + int16x8_t s4_0, s4_2, s4_3, s4_9, s4_10, s4_13, s4_14, s4_16, s4_17, s4_18, + s4_19, s4_20, s4_21, s4_22, s4_23, s4_24, s4_25, s4_26, s4_27, s4_28, + s4_29, s4_30, s4_31; + int16x8_t s5_0, s5_1, s5_2, s5_3, s5_5, s5_6, s5_8, s5_9, s5_10, s5_11, s5_12, + s5_13, s5_14, s5_15, s5_18, s5_19, s5_20, s5_21, s5_26, s5_27, s5_28, + s5_29; + int16x8_t s6_0, s6_1, s6_2, s6_3, s6_4, s6_5, s6_6, s6_7, s6_10, s6_11, s6_12, + s6_13, s6_16, s6_17, s6_18, s6_19, s6_20, s6_21, s6_22, s6_23, s6_24, + s6_25, s6_26, s6_27, s6_28, s6_29, s6_30, s6_31; + int16x8_t s7_0, s7_1, s7_2, s7_3, s7_4, s7_5, s7_6, s7_7, s7_8, s7_9, s7_10, + s7_11, s7_12, s7_13, s7_14, s7_15, s7_20, s7_21, s7_22, s7_23, s7_24, + s7_25, s7_26, s7_27; + + load_8x8_s16(input, &in0, &in1, &in2, &in3, &in4, &in5, &in6, &in7); + transpose_s16_8x8(&in0, &in1, &in2, &in3, &in4, &in5, &in6, &in7); + + load_4x8_s16(input + 8, &tmp0, &tmp1, &tmp2, &tmp3, &tmp4, &tmp5, &tmp6, + &tmp7); + transpose_s16_4x8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, &in8, &in9, + &in10, &in11); + + // stage 1 + s1_16 = multiply_shift_and_narrow_s16(in1, cospi_31_64); + s1_31 = multiply_shift_and_narrow_s16(in1, cospi_1_64); + + s1_18 = multiply_shift_and_narrow_s16(in9, cospi_23_64); + s1_29 = multiply_shift_and_narrow_s16(in9, cospi_9_64); + + s1_19 = multiply_shift_and_narrow_s16(in7, -cospi_25_64); + s1_28 = multiply_shift_and_narrow_s16(in7, cospi_7_64); + + s1_20 = multiply_shift_and_narrow_s16(in5, cospi_27_64); + s1_27 = multiply_shift_and_narrow_s16(in5, cospi_5_64); + + s1_21 = multiply_shift_and_narrow_s16(in11, -cospi_21_64); + s1_26 = multiply_shift_and_narrow_s16(in11, cospi_11_64); + + s1_23 = multiply_shift_and_narrow_s16(in3, -cospi_29_64); + s1_24 = multiply_shift_and_narrow_s16(in3, cospi_3_64); + + // stage 2 + s2_8 = multiply_shift_and_narrow_s16(in2, cospi_30_64); + s2_15 = multiply_shift_and_narrow_s16(in2, cospi_2_64); + + s2_10 = multiply_shift_and_narrow_s16(in10, cospi_22_64); + s2_13 = multiply_shift_and_narrow_s16(in10, cospi_10_64); + + s2_11 = multiply_shift_and_narrow_s16(in6, -cospi_26_64); + s2_12 = multiply_shift_and_narrow_s16(in6, cospi_6_64); + + s2_18 = vsubq_s16(s1_19, s1_18); + s2_19 = vaddq_s16(s1_18, s1_19); + s2_20 = vaddq_s16(s1_20, s1_21); + s2_21 = vsubq_s16(s1_20, s1_21); + s2_26 = vsubq_s16(s1_27, s1_26); + s2_27 
= vaddq_s16(s1_26, s1_27); + s2_28 = vaddq_s16(s1_28, s1_29); + s2_29 = vsubq_s16(s1_28, s1_29); + + // stage 3 + s3_4 = multiply_shift_and_narrow_s16(in4, cospi_28_64); + s3_7 = multiply_shift_and_narrow_s16(in4, cospi_4_64); + + s3_10 = vsubq_s16(s2_11, s2_10); + s3_11 = vaddq_s16(s2_10, s2_11); + s3_12 = vaddq_s16(s2_12, s2_13); + s3_13 = vsubq_s16(s2_12, s2_13); + + s3_17 = multiply_accumulate_shift_and_narrow_s16(s1_16, -cospi_4_64, s1_31, + cospi_28_64); + s3_30 = multiply_accumulate_shift_and_narrow_s16(s1_16, cospi_28_64, s1_31, + cospi_4_64); + + s3_18 = multiply_accumulate_shift_and_narrow_s16(s2_18, -cospi_28_64, s2_29, + -cospi_4_64); + s3_29 = multiply_accumulate_shift_and_narrow_s16(s2_18, -cospi_4_64, s2_29, + cospi_28_64); + + s3_21 = multiply_accumulate_shift_and_narrow_s16(s2_21, -cospi_20_64, s2_26, + cospi_12_64); + s3_26 = multiply_accumulate_shift_and_narrow_s16(s2_21, cospi_12_64, s2_26, + cospi_20_64); + + s3_22 = multiply_accumulate_shift_and_narrow_s16(s1_23, -cospi_12_64, s1_24, + -cospi_20_64); + s3_25 = multiply_accumulate_shift_and_narrow_s16(s1_23, -cospi_20_64, s1_24, + cospi_12_64); + + // stage 4 + s4_0 = multiply_shift_and_narrow_s16(in0, cospi_16_64); + s4_2 = multiply_shift_and_narrow_s16(in8, cospi_24_64); + s4_3 = multiply_shift_and_narrow_s16(in8, cospi_8_64); + + s4_9 = multiply_accumulate_shift_and_narrow_s16(s2_8, -cospi_8_64, s2_15, + cospi_24_64); + s4_14 = multiply_accumulate_shift_and_narrow_s16(s2_8, cospi_24_64, s2_15, + cospi_8_64); + + s4_10 = multiply_accumulate_shift_and_narrow_s16(s3_10, -cospi_24_64, s3_13, + -cospi_8_64); + s4_13 = multiply_accumulate_shift_and_narrow_s16(s3_10, -cospi_8_64, s3_13, + cospi_24_64); + + s4_16 = vaddq_s16(s1_16, s2_19); + s4_17 = vaddq_s16(s3_17, s3_18); + s4_18 = vsubq_s16(s3_17, s3_18); + s4_19 = vsubq_s16(s1_16, s2_19); + s4_20 = vsubq_s16(s1_23, s2_20); + s4_21 = vsubq_s16(s3_22, s3_21); + s4_22 = vaddq_s16(s3_21, s3_22); + s4_23 = vaddq_s16(s2_20, s1_23); + s4_24 = vaddq_s16(s1_24, s2_27); + s4_25 = vaddq_s16(s3_25, s3_26); + s4_26 = vsubq_s16(s3_25, s3_26); + s4_27 = vsubq_s16(s1_24, s2_27); + s4_28 = vsubq_s16(s1_31, s2_28); + s4_29 = vsubq_s16(s3_30, s3_29); + s4_30 = vaddq_s16(s3_29, s3_30); + s4_31 = vaddq_s16(s2_28, s1_31); + + // stage 5 + s5_0 = vaddq_s16(s4_0, s4_3); + s5_1 = vaddq_s16(s4_0, s4_2); + s5_2 = vsubq_s16(s4_0, s4_2); + s5_3 = vsubq_s16(s4_0, s4_3); + + s5_5 = sub_multiply_shift_and_narrow_s16(s3_7, s3_4, cospi_16_64); + s5_6 = add_multiply_shift_and_narrow_s16(s3_4, s3_7, cospi_16_64); + + s5_8 = vaddq_s16(s2_8, s3_11); + s5_9 = vaddq_s16(s4_9, s4_10); + s5_10 = vsubq_s16(s4_9, s4_10); + s5_11 = vsubq_s16(s2_8, s3_11); + s5_12 = vsubq_s16(s2_15, s3_12); + s5_13 = vsubq_s16(s4_14, s4_13); + s5_14 = vaddq_s16(s4_13, s4_14); + s5_15 = vaddq_s16(s2_15, s3_12); + + s5_18 = multiply_accumulate_shift_and_narrow_s16(s4_18, -cospi_8_64, s4_29, + cospi_24_64); + s5_29 = multiply_accumulate_shift_and_narrow_s16(s4_18, cospi_24_64, s4_29, + cospi_8_64); + + s5_19 = multiply_accumulate_shift_and_narrow_s16(s4_19, -cospi_8_64, s4_28, + cospi_24_64); + s5_28 = multiply_accumulate_shift_and_narrow_s16(s4_19, cospi_24_64, s4_28, + cospi_8_64); + + s5_20 = multiply_accumulate_shift_and_narrow_s16(s4_20, -cospi_24_64, s4_27, + -cospi_8_64); + s5_27 = multiply_accumulate_shift_and_narrow_s16(s4_20, -cospi_8_64, s4_27, + cospi_24_64); + + s5_21 = multiply_accumulate_shift_and_narrow_s16(s4_21, -cospi_24_64, s4_26, + -cospi_8_64); + s5_26 = multiply_accumulate_shift_and_narrow_s16(s4_21, 
-cospi_8_64, s4_26, + cospi_24_64); + + // stage 6 + s6_0 = vaddq_s16(s5_0, s3_7); + s6_1 = vaddq_s16(s5_1, s5_6); + s6_2 = vaddq_s16(s5_2, s5_5); + s6_3 = vaddq_s16(s5_3, s3_4); + s6_4 = vsubq_s16(s5_3, s3_4); + s6_5 = vsubq_s16(s5_2, s5_5); + s6_6 = vsubq_s16(s5_1, s5_6); + s6_7 = vsubq_s16(s5_0, s3_7); + + s6_10 = sub_multiply_shift_and_narrow_s16(s5_13, s5_10, cospi_16_64); + s6_13 = add_multiply_shift_and_narrow_s16(s5_10, s5_13, cospi_16_64); + + s6_11 = sub_multiply_shift_and_narrow_s16(s5_12, s5_11, cospi_16_64); + s6_12 = add_multiply_shift_and_narrow_s16(s5_11, s5_12, cospi_16_64); + + s6_16 = vaddq_s16(s4_16, s4_23); + s6_17 = vaddq_s16(s4_17, s4_22); + s6_18 = vaddq_s16(s5_18, s5_21); + s6_19 = vaddq_s16(s5_19, s5_20); + s6_20 = vsubq_s16(s5_19, s5_20); + s6_21 = vsubq_s16(s5_18, s5_21); + s6_22 = vsubq_s16(s4_17, s4_22); + s6_23 = vsubq_s16(s4_16, s4_23); + + s6_24 = vsubq_s16(s4_31, s4_24); + s6_25 = vsubq_s16(s4_30, s4_25); + s6_26 = vsubq_s16(s5_29, s5_26); + s6_27 = vsubq_s16(s5_28, s5_27); + s6_28 = vaddq_s16(s5_27, s5_28); + s6_29 = vaddq_s16(s5_26, s5_29); + s6_30 = vaddq_s16(s4_25, s4_30); + s6_31 = vaddq_s16(s4_24, s4_31); + + // stage 7 + s7_0 = vaddq_s16(s6_0, s5_15); + s7_1 = vaddq_s16(s6_1, s5_14); + s7_2 = vaddq_s16(s6_2, s6_13); + s7_3 = vaddq_s16(s6_3, s6_12); + s7_4 = vaddq_s16(s6_4, s6_11); + s7_5 = vaddq_s16(s6_5, s6_10); + s7_6 = vaddq_s16(s6_6, s5_9); + s7_7 = vaddq_s16(s6_7, s5_8); + s7_8 = vsubq_s16(s6_7, s5_8); + s7_9 = vsubq_s16(s6_6, s5_9); + s7_10 = vsubq_s16(s6_5, s6_10); + s7_11 = vsubq_s16(s6_4, s6_11); + s7_12 = vsubq_s16(s6_3, s6_12); + s7_13 = vsubq_s16(s6_2, s6_13); + s7_14 = vsubq_s16(s6_1, s5_14); + s7_15 = vsubq_s16(s6_0, s5_15); + + s7_20 = sub_multiply_shift_and_narrow_s16(s6_27, s6_20, cospi_16_64); + s7_27 = add_multiply_shift_and_narrow_s16(s6_20, s6_27, cospi_16_64); + + s7_21 = sub_multiply_shift_and_narrow_s16(s6_26, s6_21, cospi_16_64); + s7_26 = add_multiply_shift_and_narrow_s16(s6_21, s6_26, cospi_16_64); + + s7_22 = sub_multiply_shift_and_narrow_s16(s6_25, s6_22, cospi_16_64); + s7_25 = add_multiply_shift_and_narrow_s16(s6_22, s6_25, cospi_16_64); + + s7_23 = sub_multiply_shift_and_narrow_s16(s6_24, s6_23, cospi_16_64); + s7_24 = add_multiply_shift_and_narrow_s16(s6_23, s6_24, cospi_16_64); + + // final stage + vst1q_s16(output, vaddq_s16(s7_0, s6_31)); + output += 16; + vst1q_s16(output, vaddq_s16(s7_1, s6_30)); + output += 16; + vst1q_s16(output, vaddq_s16(s7_2, s6_29)); + output += 16; + vst1q_s16(output, vaddq_s16(s7_3, s6_28)); + output += 16; + vst1q_s16(output, vaddq_s16(s7_4, s7_27)); + output += 16; + vst1q_s16(output, vaddq_s16(s7_5, s7_26)); + output += 16; + vst1q_s16(output, vaddq_s16(s7_6, s7_25)); + output += 16; + vst1q_s16(output, vaddq_s16(s7_7, s7_24)); + output += 16; + + vst1q_s16(output, vaddq_s16(s7_8, s7_23)); + output += 16; + vst1q_s16(output, vaddq_s16(s7_9, s7_22)); + output += 16; + vst1q_s16(output, vaddq_s16(s7_10, s7_21)); + output += 16; + vst1q_s16(output, vaddq_s16(s7_11, s7_20)); + output += 16; + vst1q_s16(output, vaddq_s16(s7_12, s6_19)); + output += 16; + vst1q_s16(output, vaddq_s16(s7_13, s6_18)); + output += 16; + vst1q_s16(output, vaddq_s16(s7_14, s6_17)); + output += 16; + vst1q_s16(output, vaddq_s16(s7_15, s6_16)); + output += 16; + + vst1q_s16(output, vsubq_s16(s7_15, s6_16)); + output += 16; + vst1q_s16(output, vsubq_s16(s7_14, s6_17)); + output += 16; + vst1q_s16(output, vsubq_s16(s7_13, s6_18)); + output += 16; + vst1q_s16(output, vsubq_s16(s7_12, s6_19)); + output += 16; + 
vst1q_s16(output, vsubq_s16(s7_11, s7_20)); + output += 16; + vst1q_s16(output, vsubq_s16(s7_10, s7_21)); + output += 16; + vst1q_s16(output, vsubq_s16(s7_9, s7_22)); + output += 16; + vst1q_s16(output, vsubq_s16(s7_8, s7_23)); + output += 16; + + vst1q_s16(output, vsubq_s16(s7_7, s7_24)); + output += 16; + vst1q_s16(output, vsubq_s16(s7_6, s7_25)); + output += 16; + vst1q_s16(output, vsubq_s16(s7_5, s7_26)); + output += 16; + vst1q_s16(output, vsubq_s16(s7_4, s7_27)); + output += 16; + vst1q_s16(output, vsubq_s16(s7_3, s6_28)); + output += 16; + vst1q_s16(output, vsubq_s16(s7_2, s6_29)); + output += 16; + vst1q_s16(output, vsubq_s16(s7_1, s6_30)); + output += 16; + vst1q_s16(output, vsubq_s16(s7_0, s6_31)); +} + +static void idct32_16_neon(const int16_t *input, uint8_t *output, int stride) { + int16x8_t in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12, + in13, in14, in15; + int16x8_t s1_16, s1_17, s1_18, s1_19, s1_20, s1_21, s1_22, s1_23, s1_24, + s1_25, s1_26, s1_27, s1_28, s1_29, s1_30, s1_31; + int16x8_t s2_8, s2_9, s2_10, s2_11, s2_12, s2_13, s2_14, s2_15, s2_16, s2_17, + s2_18, s2_19, s2_20, s2_21, s2_22, s2_23, s2_24, s2_25, s2_26, s2_27, + s2_28, s2_29, s2_30, s2_31; + int16x8_t s3_4, s3_5, s3_6, s3_7, s3_8, s3_9, s3_10, s3_11, s3_12, s3_13, + s3_14, s3_15, s3_17, s3_18, s3_21, s3_22, s3_25, s3_26, s3_29, s3_30; + int16x8_t s4_0, s4_2, s4_3, s4_4, s4_5, s4_6, s4_7, s4_9, s4_10, s4_13, s4_14, + s4_16, s4_17, s4_18, s4_19, s4_20, s4_21, s4_22, s4_23, s4_24, s4_25, + s4_26, s4_27, s4_28, s4_29, s4_30, s4_31; + int16x8_t s5_0, s5_1, s5_2, s5_3, s5_5, s5_6, s5_8, s5_9, s5_10, s5_11, s5_12, + s5_13, s5_14, s5_15, s5_18, s5_19, s5_20, s5_21, s5_26, s5_27, s5_28, + s5_29; + int16x8_t s6_0, s6_1, s6_2, s6_3, s6_4, s6_5, s6_6, s6_7, s6_10, s6_11, s6_12, + s6_13, s6_16, s6_17, s6_18, s6_19, s6_20, s6_21, s6_22, s6_23, s6_24, + s6_25, s6_26, s6_27, s6_28, s6_29, s6_30, s6_31; + int16x8_t s7_0, s7_1, s7_2, s7_3, s7_4, s7_5, s7_6, s7_7, s7_8, s7_9, s7_10, + s7_11, s7_12, s7_13, s7_14, s7_15, s7_20, s7_21, s7_22, s7_23, s7_24, + s7_25, s7_26, s7_27; + int16x8_t out0, out1, out2, out3, out4, out5, out6, out7; + + load_and_transpose_s16_8x8(input, 16, &in0, &in1, &in2, &in3, &in4, &in5, + &in6, &in7); + + load_and_transpose_s16_8x8(input + 8, 16, &in8, &in9, &in10, &in11, &in12, + &in13, &in14, &in15); + + // stage 1 + s1_16 = multiply_shift_and_narrow_s16(in1, cospi_31_64); + s1_31 = multiply_shift_and_narrow_s16(in1, cospi_1_64); + + s1_17 = multiply_shift_and_narrow_s16(in15, -cospi_17_64); + s1_30 = multiply_shift_and_narrow_s16(in15, cospi_15_64); + + s1_18 = multiply_shift_and_narrow_s16(in9, cospi_23_64); + s1_29 = multiply_shift_and_narrow_s16(in9, cospi_9_64); + + s1_19 = multiply_shift_and_narrow_s16(in7, -cospi_25_64); + s1_28 = multiply_shift_and_narrow_s16(in7, cospi_7_64); + + s1_20 = multiply_shift_and_narrow_s16(in5, cospi_27_64); + s1_27 = multiply_shift_and_narrow_s16(in5, cospi_5_64); + + s1_21 = multiply_shift_and_narrow_s16(in11, -cospi_21_64); + s1_26 = multiply_shift_and_narrow_s16(in11, cospi_11_64); + + s1_22 = multiply_shift_and_narrow_s16(in13, cospi_19_64); + s1_25 = multiply_shift_and_narrow_s16(in13, cospi_13_64); + + s1_23 = multiply_shift_and_narrow_s16(in3, -cospi_29_64); + s1_24 = multiply_shift_and_narrow_s16(in3, cospi_3_64); + + // stage 2 + s2_8 = multiply_shift_and_narrow_s16(in2, cospi_30_64); + s2_15 = multiply_shift_and_narrow_s16(in2, cospi_2_64); + + s2_9 = multiply_shift_and_narrow_s16(in14, -cospi_18_64); + s2_14 = 
multiply_shift_and_narrow_s16(in14, cospi_14_64); + + s2_10 = multiply_shift_and_narrow_s16(in10, cospi_22_64); + s2_13 = multiply_shift_and_narrow_s16(in10, cospi_10_64); + + s2_11 = multiply_shift_and_narrow_s16(in6, -cospi_26_64); + s2_12 = multiply_shift_and_narrow_s16(in6, cospi_6_64); + + s2_16 = vaddq_s16(s1_16, s1_17); + s2_17 = vsubq_s16(s1_16, s1_17); + s2_18 = vsubq_s16(s1_19, s1_18); + s2_19 = vaddq_s16(s1_18, s1_19); + s2_20 = vaddq_s16(s1_20, s1_21); + s2_21 = vsubq_s16(s1_20, s1_21); + s2_22 = vsubq_s16(s1_23, s1_22); + s2_23 = vaddq_s16(s1_22, s1_23); + s2_24 = vaddq_s16(s1_24, s1_25); + s2_25 = vsubq_s16(s1_24, s1_25); + s2_26 = vsubq_s16(s1_27, s1_26); + s2_27 = vaddq_s16(s1_26, s1_27); + s2_28 = vaddq_s16(s1_28, s1_29); + s2_29 = vsubq_s16(s1_28, s1_29); + s2_30 = vsubq_s16(s1_31, s1_30); + s2_31 = vaddq_s16(s1_30, s1_31); + + // stage 3 + s3_4 = multiply_shift_and_narrow_s16(in4, cospi_28_64); + s3_7 = multiply_shift_and_narrow_s16(in4, cospi_4_64); + + s3_5 = multiply_shift_and_narrow_s16(in12, -cospi_20_64); + s3_6 = multiply_shift_and_narrow_s16(in12, cospi_12_64); + + s3_8 = vaddq_s16(s2_8, s2_9); + s3_9 = vsubq_s16(s2_8, s2_9); + s3_10 = vsubq_s16(s2_11, s2_10); + s3_11 = vaddq_s16(s2_10, s2_11); + s3_12 = vaddq_s16(s2_12, s2_13); + s3_13 = vsubq_s16(s2_12, s2_13); + s3_14 = vsubq_s16(s2_15, s2_14); + s3_15 = vaddq_s16(s2_14, s2_15); + + s3_17 = multiply_accumulate_shift_and_narrow_s16(s2_17, -cospi_4_64, s2_30, + cospi_28_64); + s3_30 = multiply_accumulate_shift_and_narrow_s16(s2_17, cospi_28_64, s2_30, + cospi_4_64); + + s3_18 = multiply_accumulate_shift_and_narrow_s16(s2_18, -cospi_28_64, s2_29, + -cospi_4_64); + s3_29 = multiply_accumulate_shift_and_narrow_s16(s2_18, -cospi_4_64, s2_29, + cospi_28_64); + + s3_21 = multiply_accumulate_shift_and_narrow_s16(s2_21, -cospi_20_64, s2_26, + cospi_12_64); + s3_26 = multiply_accumulate_shift_and_narrow_s16(s2_21, cospi_12_64, s2_26, + cospi_20_64); + + s3_22 = multiply_accumulate_shift_and_narrow_s16(s2_22, -cospi_12_64, s2_25, + -cospi_20_64); + s3_25 = multiply_accumulate_shift_and_narrow_s16(s2_22, -cospi_20_64, s2_25, + cospi_12_64); + + // stage 4 + s4_0 = multiply_shift_and_narrow_s16(in0, cospi_16_64); + s4_2 = multiply_shift_and_narrow_s16(in8, cospi_24_64); + s4_3 = multiply_shift_and_narrow_s16(in8, cospi_8_64); + + s4_4 = vaddq_s16(s3_4, s3_5); + s4_5 = vsubq_s16(s3_4, s3_5); + s4_6 = vsubq_s16(s3_7, s3_6); + s4_7 = vaddq_s16(s3_6, s3_7); + + s4_9 = multiply_accumulate_shift_and_narrow_s16(s3_9, -cospi_8_64, s3_14, + cospi_24_64); + s4_14 = multiply_accumulate_shift_and_narrow_s16(s3_9, cospi_24_64, s3_14, + cospi_8_64); + + s4_10 = multiply_accumulate_shift_and_narrow_s16(s3_10, -cospi_24_64, s3_13, + -cospi_8_64); + s4_13 = multiply_accumulate_shift_and_narrow_s16(s3_10, -cospi_8_64, s3_13, + cospi_24_64); + + s4_16 = vaddq_s16(s2_16, s2_19); + s4_17 = vaddq_s16(s3_17, s3_18); + s4_18 = vsubq_s16(s3_17, s3_18); + s4_19 = vsubq_s16(s2_16, s2_19); + s4_20 = vsubq_s16(s2_23, s2_20); + s4_21 = vsubq_s16(s3_22, s3_21); + s4_22 = vaddq_s16(s3_21, s3_22); + s4_23 = vaddq_s16(s2_20, s2_23); + s4_24 = vaddq_s16(s2_24, s2_27); + s4_25 = vaddq_s16(s3_25, s3_26); + s4_26 = vsubq_s16(s3_25, s3_26); + s4_27 = vsubq_s16(s2_24, s2_27); + s4_28 = vsubq_s16(s2_31, s2_28); + s4_29 = vsubq_s16(s3_30, s3_29); + s4_30 = vaddq_s16(s3_29, s3_30); + s4_31 = vaddq_s16(s2_28, s2_31); + + // stage 5 + s5_0 = vaddq_s16(s4_0, s4_3); + s5_1 = vaddq_s16(s4_0, s4_2); + s5_2 = vsubq_s16(s4_0, s4_2); + s5_3 = vsubq_s16(s4_0, s4_3); + + s5_5 
= sub_multiply_shift_and_narrow_s16(s4_6, s4_5, cospi_16_64); + s5_6 = add_multiply_shift_and_narrow_s16(s4_5, s4_6, cospi_16_64); + + s5_8 = vaddq_s16(s3_8, s3_11); + s5_9 = vaddq_s16(s4_9, s4_10); + s5_10 = vsubq_s16(s4_9, s4_10); + s5_11 = vsubq_s16(s3_8, s3_11); + s5_12 = vsubq_s16(s3_15, s3_12); + s5_13 = vsubq_s16(s4_14, s4_13); + s5_14 = vaddq_s16(s4_13, s4_14); + s5_15 = vaddq_s16(s3_15, s3_12); + + s5_18 = multiply_accumulate_shift_and_narrow_s16(s4_18, -cospi_8_64, s4_29, + cospi_24_64); + s5_29 = multiply_accumulate_shift_and_narrow_s16(s4_18, cospi_24_64, s4_29, + cospi_8_64); + + s5_19 = multiply_accumulate_shift_and_narrow_s16(s4_19, -cospi_8_64, s4_28, + cospi_24_64); + s5_28 = multiply_accumulate_shift_and_narrow_s16(s4_19, cospi_24_64, s4_28, + cospi_8_64); + + s5_20 = multiply_accumulate_shift_and_narrow_s16(s4_20, -cospi_24_64, s4_27, + -cospi_8_64); + s5_27 = multiply_accumulate_shift_and_narrow_s16(s4_20, -cospi_8_64, s4_27, + cospi_24_64); + + s5_21 = multiply_accumulate_shift_and_narrow_s16(s4_21, -cospi_24_64, s4_26, + -cospi_8_64); + s5_26 = multiply_accumulate_shift_and_narrow_s16(s4_21, -cospi_8_64, s4_26, + cospi_24_64); + + // stage 6 + s6_0 = vaddq_s16(s5_0, s4_7); + s6_1 = vaddq_s16(s5_1, s5_6); + s6_2 = vaddq_s16(s5_2, s5_5); + s6_3 = vaddq_s16(s5_3, s4_4); + s6_4 = vsubq_s16(s5_3, s4_4); + s6_5 = vsubq_s16(s5_2, s5_5); + s6_6 = vsubq_s16(s5_1, s5_6); + s6_7 = vsubq_s16(s5_0, s4_7); + + s6_10 = sub_multiply_shift_and_narrow_s16(s5_13, s5_10, cospi_16_64); + s6_13 = add_multiply_shift_and_narrow_s16(s5_10, s5_13, cospi_16_64); + + s6_11 = sub_multiply_shift_and_narrow_s16(s5_12, s5_11, cospi_16_64); + s6_12 = add_multiply_shift_and_narrow_s16(s5_11, s5_12, cospi_16_64); + + s6_16 = vaddq_s16(s4_16, s4_23); + s6_17 = vaddq_s16(s4_17, s4_22); + s6_18 = vaddq_s16(s5_18, s5_21); + s6_19 = vaddq_s16(s5_19, s5_20); + s6_20 = vsubq_s16(s5_19, s5_20); + s6_21 = vsubq_s16(s5_18, s5_21); + s6_22 = vsubq_s16(s4_17, s4_22); + s6_23 = vsubq_s16(s4_16, s4_23); + s6_24 = vsubq_s16(s4_31, s4_24); + s6_25 = vsubq_s16(s4_30, s4_25); + s6_26 = vsubq_s16(s5_29, s5_26); + s6_27 = vsubq_s16(s5_28, s5_27); + s6_28 = vaddq_s16(s5_27, s5_28); + s6_29 = vaddq_s16(s5_26, s5_29); + s6_30 = vaddq_s16(s4_25, s4_30); + s6_31 = vaddq_s16(s4_24, s4_31); + + // stage 7 + s7_0 = vaddq_s16(s6_0, s5_15); + s7_1 = vaddq_s16(s6_1, s5_14); + s7_2 = vaddq_s16(s6_2, s6_13); + s7_3 = vaddq_s16(s6_3, s6_12); + s7_4 = vaddq_s16(s6_4, s6_11); + s7_5 = vaddq_s16(s6_5, s6_10); + s7_6 = vaddq_s16(s6_6, s5_9); + s7_7 = vaddq_s16(s6_7, s5_8); + s7_8 = vsubq_s16(s6_7, s5_8); + s7_9 = vsubq_s16(s6_6, s5_9); + s7_10 = vsubq_s16(s6_5, s6_10); + s7_11 = vsubq_s16(s6_4, s6_11); + s7_12 = vsubq_s16(s6_3, s6_12); + s7_13 = vsubq_s16(s6_2, s6_13); + s7_14 = vsubq_s16(s6_1, s5_14); + s7_15 = vsubq_s16(s6_0, s5_15); + + s7_20 = sub_multiply_shift_and_narrow_s16(s6_27, s6_20, cospi_16_64); + s7_27 = add_multiply_shift_and_narrow_s16(s6_20, s6_27, cospi_16_64); + + s7_21 = sub_multiply_shift_and_narrow_s16(s6_26, s6_21, cospi_16_64); + s7_26 = add_multiply_shift_and_narrow_s16(s6_21, s6_26, cospi_16_64); + + s7_22 = sub_multiply_shift_and_narrow_s16(s6_25, s6_22, cospi_16_64); + s7_25 = add_multiply_shift_and_narrow_s16(s6_22, s6_25, cospi_16_64); + + s7_23 = sub_multiply_shift_and_narrow_s16(s6_24, s6_23, cospi_16_64); + s7_24 = add_multiply_shift_and_narrow_s16(s6_23, s6_24, cospi_16_64); + + // final stage + out0 = vaddq_s16(s7_0, s6_31); + out1 = vaddq_s16(s7_1, s6_30); + out2 = vaddq_s16(s7_2, s6_29); + out3 = 
vaddq_s16(s7_3, s6_28); + out4 = vaddq_s16(s7_4, s7_27); + out5 = vaddq_s16(s7_5, s7_26); + out6 = vaddq_s16(s7_6, s7_25); + out7 = vaddq_s16(s7_7, s7_24); + + add_and_store_u8_s16(out0, out1, out2, out3, out4, out5, out6, out7, output, + stride); + + out0 = vaddq_s16(s7_8, s7_23); + out1 = vaddq_s16(s7_9, s7_22); + out2 = vaddq_s16(s7_10, s7_21); + out3 = vaddq_s16(s7_11, s7_20); + out4 = vaddq_s16(s7_12, s6_19); + out5 = vaddq_s16(s7_13, s6_18); + out6 = vaddq_s16(s7_14, s6_17); + out7 = vaddq_s16(s7_15, s6_16); + + add_and_store_u8_s16(out0, out1, out2, out3, out4, out5, out6, out7, + output + (8 * stride), stride); + + out0 = vsubq_s16(s7_15, s6_16); + out1 = vsubq_s16(s7_14, s6_17); + out2 = vsubq_s16(s7_13, s6_18); + out3 = vsubq_s16(s7_12, s6_19); + out4 = vsubq_s16(s7_11, s7_20); + out5 = vsubq_s16(s7_10, s7_21); + out6 = vsubq_s16(s7_9, s7_22); + out7 = vsubq_s16(s7_8, s7_23); + + add_and_store_u8_s16(out0, out1, out2, out3, out4, out5, out6, out7, + output + (16 * stride), stride); + + out0 = vsubq_s16(s7_7, s7_24); + out1 = vsubq_s16(s7_6, s7_25); + out2 = vsubq_s16(s7_5, s7_26); + out3 = vsubq_s16(s7_4, s7_27); + out4 = vsubq_s16(s7_3, s6_28); + out5 = vsubq_s16(s7_2, s6_29); + out6 = vsubq_s16(s7_1, s6_30); + out7 = vsubq_s16(s7_0, s6_31); + + add_and_store_u8_s16(out0, out1, out2, out3, out4, out5, out6, out7, + output + (24 * stride), stride); +} + +void vpx_idct32x32_135_add_neon(const tran_low_t *input, uint8_t *dest, + int stride) { + int i; + int16_t temp[32 * 16]; + int16_t *t = temp; + + idct32_12_neon(input, temp); + idct32_12_neon(input + 32 * 8, temp + 8); + + for (i = 0; i < 32; i += 8) { + idct32_16_neon(t, dest, stride); + t += (16 * 8); + dest += 8; + } +} diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c index 6be4b01229b..604d82abd18 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c @@ -10,127 +10,48 @@ #include <arm_neon.h> -#include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" #include "vpx_dsp/inv_txfm.h" -#include "vpx_ports/mem.h" -static INLINE void LD_16x8(uint8_t *d, int d_stride, uint8x16_t *q8u8, - uint8x16_t *q9u8, uint8x16_t *q10u8, - uint8x16_t *q11u8, uint8x16_t *q12u8, - uint8x16_t *q13u8, uint8x16_t *q14u8, - uint8x16_t *q15u8) { - *q8u8 = vld1q_u8(d); - d += d_stride; - *q9u8 = vld1q_u8(d); - d += d_stride; - *q10u8 = vld1q_u8(d); - d += d_stride; - *q11u8 = vld1q_u8(d); - d += d_stride; - *q12u8 = vld1q_u8(d); - d += d_stride; - *q13u8 = vld1q_u8(d); - d += d_stride; - *q14u8 = vld1q_u8(d); - d += d_stride; - *q15u8 = vld1q_u8(d); +static INLINE void idct32x32_1_add_pos_kernel(uint8_t **dest, const int stride, + const uint8x16_t res) { + const uint8x16_t a0 = vld1q_u8(*dest); + const uint8x16_t a1 = vld1q_u8(*dest + 16); + const uint8x16_t b0 = vqaddq_u8(a0, res); + const uint8x16_t b1 = vqaddq_u8(a1, res); + vst1q_u8(*dest, b0); + vst1q_u8(*dest + 16, b1); + *dest += stride; } -static INLINE void ADD_DIFF_16x8(uint8x16_t qdiffu8, uint8x16_t *q8u8, - uint8x16_t *q9u8, uint8x16_t *q10u8, - uint8x16_t *q11u8, uint8x16_t *q12u8, - uint8x16_t *q13u8, uint8x16_t *q14u8, - uint8x16_t *q15u8) { - *q8u8 = vqaddq_u8(*q8u8, qdiffu8); - *q9u8 = vqaddq_u8(*q9u8, qdiffu8); - *q10u8 = vqaddq_u8(*q10u8, qdiffu8); - *q11u8 = vqaddq_u8(*q11u8, qdiffu8); - *q12u8 = vqaddq_u8(*q12u8, 
qdiffu8); - *q13u8 = vqaddq_u8(*q13u8, qdiffu8); - *q14u8 = vqaddq_u8(*q14u8, qdiffu8); - *q15u8 = vqaddq_u8(*q15u8, qdiffu8); -} - -static INLINE void SUB_DIFF_16x8(uint8x16_t qdiffu8, uint8x16_t *q8u8, - uint8x16_t *q9u8, uint8x16_t *q10u8, - uint8x16_t *q11u8, uint8x16_t *q12u8, - uint8x16_t *q13u8, uint8x16_t *q14u8, - uint8x16_t *q15u8) { - *q8u8 = vqsubq_u8(*q8u8, qdiffu8); - *q9u8 = vqsubq_u8(*q9u8, qdiffu8); - *q10u8 = vqsubq_u8(*q10u8, qdiffu8); - *q11u8 = vqsubq_u8(*q11u8, qdiffu8); - *q12u8 = vqsubq_u8(*q12u8, qdiffu8); - *q13u8 = vqsubq_u8(*q13u8, qdiffu8); - *q14u8 = vqsubq_u8(*q14u8, qdiffu8); - *q15u8 = vqsubq_u8(*q15u8, qdiffu8); -} - -static INLINE void ST_16x8(uint8_t *d, int d_stride, uint8x16_t *q8u8, - uint8x16_t *q9u8, uint8x16_t *q10u8, - uint8x16_t *q11u8, uint8x16_t *q12u8, - uint8x16_t *q13u8, uint8x16_t *q14u8, - uint8x16_t *q15u8) { - vst1q_u8(d, *q8u8); - d += d_stride; - vst1q_u8(d, *q9u8); - d += d_stride; - vst1q_u8(d, *q10u8); - d += d_stride; - vst1q_u8(d, *q11u8); - d += d_stride; - vst1q_u8(d, *q12u8); - d += d_stride; - vst1q_u8(d, *q13u8); - d += d_stride; - vst1q_u8(d, *q14u8); - d += d_stride; - vst1q_u8(d, *q15u8); +static INLINE void idct32x32_1_add_neg_kernel(uint8_t **dest, const int stride, + const uint8x16_t res) { + const uint8x16_t a0 = vld1q_u8(*dest); + const uint8x16_t a1 = vld1q_u8(*dest + 16); + const uint8x16_t b0 = vqsubq_u8(a0, res); + const uint8x16_t b1 = vqsubq_u8(a1, res); + vst1q_u8(*dest, b0); + vst1q_u8(*dest + 16, b1); + *dest += stride; } void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest, - int dest_stride) { - uint8x16_t q0u8, q8u8, q9u8, q10u8, q11u8, q12u8, q13u8, q14u8, q15u8; - int i, j, dest_stride8; - uint8_t *d; - int16_t a1; - int16_t out = dct_const_round_shift(input[0] * cospi_16_64); - - out = dct_const_round_shift(out * cospi_16_64); - a1 = ROUND_POWER_OF_TWO(out, 6); - - dest_stride8 = dest_stride * 8; - if (a1 >= 0) { // diff_positive_32_32 - a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1; - q0u8 = vdupq_n_u8((uint8_t)a1); - for (i = 0; i < 2; i++, dest += 16) { // diff_positive_32_32_loop - d = dest; - for (j = 0; j < 4; j++) { - LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, - &q14u8, &q15u8); - ADD_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, - &q14u8, &q15u8); - ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, - &q14u8, &q15u8); - d += dest_stride8; - } + int stride) { + int i; + const int16_t out0 = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64)); + const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6); + + if (a1 >= 0) { + const uint8x16_t dc = create_dcq(a1); + for (i = 0; i < 32; i++) { + idct32x32_1_add_pos_kernel(&dest, stride, dc); } - } else { // diff_negative_32_32 - a1 = -a1; - a1 = a1 < 0 ? 0 : a1 > 255 ? 
255 : a1; - q0u8 = vdupq_n_u8((uint8_t)a1); - for (i = 0; i < 2; i++, dest += 16) { // diff_negative_32_32_loop - d = dest; - for (j = 0; j < 4; j++) { - LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, - &q14u8, &q15u8); - SUB_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, - &q14u8, &q15u8); - ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, - &q14u8, &q15u8); - d += dest_stride8; - } + } else { + const uint8x16_t dc = create_dcq(-a1); + for (i = 0; i < 32; i++) { + idct32x32_1_add_neg_kernel(&dest, stride, dc); } } } diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c index ebec9df54ad..b56deeea6de 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c @@ -13,6 +13,7 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/txfm_common.h" // Only for the first pass of the _34_ variant. Since it only uses values from @@ -34,7 +35,7 @@ // 5 13 20 26 // 6 21 27 33 // 7 24 32 -static void idct32_6_neon(const int16_t *input, int16_t *output) { +static void idct32_6_neon(const tran_low_t *input, int16_t *output) { int16x8_t in0, in1, in2, in3, in4, in5, in6, in7; int16x8_t s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8, s1_9, s1_10, s1_11, s1_12, s1_13, s1_14, s1_15, s1_16, s1_17, s1_18, s1_19, s1_20, @@ -46,8 +47,22 @@ static void idct32_6_neon(const int16_t *input, int16_t *output) { s2_31; int16x8_t s3_24, s3_25, s3_26, s3_27; - load_and_transpose_s16_8x8(input, 32, &in0, &in1, &in2, &in3, &in4, &in5, - &in6, &in7); + in0 = load_tran_low_to_s16q(input); + input += 32; + in1 = load_tran_low_to_s16q(input); + input += 32; + in2 = load_tran_low_to_s16q(input); + input += 32; + in3 = load_tran_low_to_s16q(input); + input += 32; + in4 = load_tran_low_to_s16q(input); + input += 32; + in5 = load_tran_low_to_s16q(input); + input += 32; + in6 = load_tran_low_to_s16q(input); + input += 32; + in7 = load_tran_low_to_s16q(input); + transpose_s16_8x8(&in0, &in1, &in2, &in3, &in4, &in5, &in6, &in7); // stage 1 // input[1] * cospi_31_64 - input[31] * cospi_1_64 (but input[31] == 0) @@ -503,7 +518,7 @@ static void idct32_8_neon(const int16_t *input, uint8_t *output, int stride) { output + (24 * stride), stride); } -void vpx_idct32x32_34_add_neon(const int16_t *input, uint8_t *dest, +void vpx_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest, int stride) { int i; int16_t temp[32 * 8]; diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_add_neon.c index 4eff9b970d9..de1bf978750 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_add_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_add_neon.c @@ -12,6 +12,7 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" #include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/txfm_common.h" @@ -146,55 +147,101 @@ static INLINE void DO_BUTTERFLY(int16x8_t q14s16, int16x8_t q13s16, q11s32 = vaddq_s32(q12s32, q11s32); q10s32 = vaddq_s32(q10s32, q15s32); - *qAs16 = vcombine_s16(vqrshrn_n_s32(q8s32, 14), vqrshrn_n_s32(q9s32, 14)); - *qBs16 = vcombine_s16(vqrshrn_n_s32(q11s32, 14), 
vqrshrn_n_s32(q10s32, 14)); + *qAs16 = vcombine_s16(vrshrn_n_s32(q8s32, 14), vrshrn_n_s32(q9s32, 14)); + *qBs16 = vcombine_s16(vrshrn_n_s32(q11s32, 14), vrshrn_n_s32(q10s32, 14)); +} + +static INLINE void load_s16x8q(const int16_t *in, int16x8_t *s0, int16x8_t *s1, + int16x8_t *s2, int16x8_t *s3, int16x8_t *s4, + int16x8_t *s5, int16x8_t *s6, int16x8_t *s7) { + *s0 = vld1q_s16(in); + in += 32; + *s1 = vld1q_s16(in); + in += 32; + *s2 = vld1q_s16(in); + in += 32; + *s3 = vld1q_s16(in); + in += 32; + *s4 = vld1q_s16(in); + in += 32; + *s5 = vld1q_s16(in); + in += 32; + *s6 = vld1q_s16(in); + in += 32; + *s7 = vld1q_s16(in); +} + +static INLINE void transpose_and_store_s16_8x8(int16x8_t a0, int16x8_t a1, + int16x8_t a2, int16x8_t a3, + int16x8_t a4, int16x8_t a5, + int16x8_t a6, int16x8_t a7, + int16_t **out) { + transpose_s16_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); + + vst1q_s16(*out, a0); + *out += 8; + vst1q_s16(*out, a1); + *out += 8; + vst1q_s16(*out, a2); + *out += 8; + vst1q_s16(*out, a3); + *out += 8; + vst1q_s16(*out, a4); + *out += 8; + vst1q_s16(*out, a5); + *out += 8; + vst1q_s16(*out, a6); + *out += 8; + vst1q_s16(*out, a7); + *out += 8; } static INLINE void idct32_transpose_pair(const int16_t *input, int16_t *t_buf) { - const int16_t *in; int i; - const int stride = 32; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + + for (i = 0; i < 4; i++, input += 8) { + load_s16x8q(input, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + transpose_and_store_s16_8x8(s0, s1, s2, s3, s4, s5, s6, s7, &t_buf); + } +} + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE void load_s16x8q_tran_low(const tran_low_t *in, int16x8_t *s0, + int16x8_t *s1, int16x8_t *s2, + int16x8_t *s3, int16x8_t *s4, + int16x8_t *s5, int16x8_t *s6, + int16x8_t *s7) { + *s0 = load_tran_low_to_s16q(in); + in += 32; + *s1 = load_tran_low_to_s16q(in); + in += 32; + *s2 = load_tran_low_to_s16q(in); + in += 32; + *s3 = load_tran_low_to_s16q(in); + in += 32; + *s4 = load_tran_low_to_s16q(in); + in += 32; + *s5 = load_tran_low_to_s16q(in); + in += 32; + *s6 = load_tran_low_to_s16q(in); + in += 32; + *s7 = load_tran_low_to_s16q(in); +} + +static INLINE void idct32_transpose_pair_tran_low(const tran_low_t *input, + int16_t *t_buf) { + int i; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; for (i = 0; i < 4; i++, input += 8) { - in = input; - q8s16 = vld1q_s16(in); - in += stride; - q9s16 = vld1q_s16(in); - in += stride; - q10s16 = vld1q_s16(in); - in += stride; - q11s16 = vld1q_s16(in); - in += stride; - q12s16 = vld1q_s16(in); - in += stride; - q13s16 = vld1q_s16(in); - in += stride; - q14s16 = vld1q_s16(in); - in += stride; - q15s16 = vld1q_s16(in); - - transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, - &q14s16, &q15s16); - - vst1q_s16(t_buf, q8s16); - t_buf += 8; - vst1q_s16(t_buf, q9s16); - t_buf += 8; - vst1q_s16(t_buf, q10s16); - t_buf += 8; - vst1q_s16(t_buf, q11s16); - t_buf += 8; - vst1q_s16(t_buf, q12s16); - t_buf += 8; - vst1q_s16(t_buf, q13s16); - t_buf += 8; - vst1q_s16(t_buf, q14s16); - t_buf += 8; - vst1q_s16(t_buf, q15s16); - t_buf += 8; + load_s16x8q_tran_low(input, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + transpose_and_store_s16_8x8(s0, s1, s2, s3, s4, s5, s6, s7, &t_buf); } } +#else // !CONFIG_VP9_HIGHBITDEPTH +#define idct32_transpose_pair_tran_low idct32_transpose_pair +#endif // CONFIG_VP9_HIGHBITDEPTH static INLINE void idct32_bands_end_1st_pass(int16_t *out, int16x8_t q2s16, int16x8_t q3s16, int16x8_t q6s16, @@ -383,16 
+430,21 @@ void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int16_t trans_buf[32 * 8]; int16_t pass1[32 * 32]; int16_t pass2[32 * 32]; + const int16_t *input_pass2 = pass1; // input of pass2 is the result of pass1 int16_t *out; int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; for (idct32_pass_loop = 0, out = pass1; idct32_pass_loop < 2; - idct32_pass_loop++, - input = pass1, // the input of pass2 is the result of pass1 - out = pass2) { - for (i = 0; i < 4; i++, input += 32 * 8, out += 8) { // idct32_bands_loop - idct32_transpose_pair(input, trans_buf); + idct32_pass_loop++, out = pass2) { + for (i = 0; i < 4; i++, out += 8) { // idct32_bands_loop + if (idct32_pass_loop == 0) { + idct32_transpose_pair_tran_low(input, trans_buf); + input += 32 * 8; + } else { + idct32_transpose_pair(input_pass2, trans_buf); + input_pass2 += 32 * 8; + } // ----------------------------------------- // BLOCK A: 16-19,28-31 diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.asm index cbfab361af8..d83421e9e66 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.asm +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.asm @@ -15,12 +15,11 @@ AREA ||.text||, CODE, READONLY, ALIGN=2 -;void vpx_idct4x4_1_add_neon(int16_t *input, uint8_t *dest, -; int dest_stride) +;void vpx_idct4x4_1_add_neon(int16_t *input, uint8_t *dest, int stride) ; ; r0 int16_t input ; r1 uint8_t *dest -; r2 int dest_stride) +; r2 int stride) |vpx_idct4x4_1_add_neon| PROC ldrsh r0, [r0] diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c index 525aac05a84..d1eae24a222 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c @@ -9,39 +9,37 @@ */ #include <arm_neon.h> +#include <assert.h> #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/inv_txfm.h" -#include "vpx_ports/mem.h" -void vpx_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest, - int dest_stride) { - uint8x8_t d6u8; - uint32x2_t d2u32 = vdup_n_u32(0); - uint16x8_t q8u16; - int16x8_t q0s16; - uint8_t *d1, *d2; - int16_t i, a1; - int16_t out = dct_const_round_shift(input[0] * cospi_16_64); - out = dct_const_round_shift(out * cospi_16_64); - a1 = ROUND_POWER_OF_TWO(out, 4); - - q0s16 = vdupq_n_s16(a1); +static INLINE void idct4x4_1_add_kernel(uint8_t **dest, const int stride, + const int16x8_t res, + uint32x2_t *const d) { + uint16x8_t a; + uint8x8_t b; + *d = vld1_lane_u32((const uint32_t *)*dest, *d, 0); + *d = vld1_lane_u32((const uint32_t *)(*dest + stride), *d, 1); + a = vaddw_u8(vreinterpretq_u16_s16(res), vreinterpret_u8_u32(*d)); + b = vqmovun_s16(vreinterpretq_s16_u16(a)); + vst1_lane_u32((uint32_t *)*dest, vreinterpret_u32_u8(b), 0); + *dest += stride; + vst1_lane_u32((uint32_t *)*dest, vreinterpret_u32_u8(b), 1); + *dest += stride; +} - // dc_only_idct_add - d1 = d2 = dest; - for (i = 0; i < 2; i++) { - d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 0); - d1 += dest_stride; - d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 1); - d1 += dest_stride; +void vpx_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest, + int stride) { + const int16_t out0 = 
WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64)); + const int16_t a1 = ROUND_POWER_OF_TWO(out1, 4); + const int16x8_t dc = vdupq_n_s16(a1); + uint32x2_t d = vdup_n_u32(0); - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q0s16), vreinterpret_u8_u32(d2u32)); - d6u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); + assert(!((intptr_t)dest % sizeof(uint32_t))); + assert(!(stride % sizeof(uint32_t))); - vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 0); - d2 += dest_stride; - vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 1); - d2 += dest_stride; - } + idct4x4_1_add_kernel(&dest, stride, dc, &d); + idct4x4_1_add_kernel(&dest, stride, dc, &d); } diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_add_neon.asm index bd4e86ded25..184d218941c 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_add_neon.asm +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_add_neon.asm @@ -18,11 +18,11 @@ INCLUDE vpx_dsp/arm/idct_neon.asm.S AREA Block, CODE, READONLY ; name this block of code -;void vpx_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride) +;void vpx_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int stride) ; ; r0 int16_t input ; r1 uint8_t *dest -; r2 int dest_stride) +; r2 int stride) |vpx_idct4x4_16_add_neon| PROC @@ -72,16 +72,15 @@ ; do the transform on transposed rows ; stage 1 - vadd.s16 d23, d16, d18 ; (input[0] + input[2]) - vsub.s16 d24, d16, d18 ; (input[0] - input[2]) - vmull.s16 q15, d17, d22 ; input[1] * cospi_24_64 vmull.s16 q1, d17, d20 ; input[1] * cospi_8_64 ; (input[0] + input[2]) * cospi_16_64; ; (input[0] - input[2]) * cospi_16_64; - vmull.s16 q13, d23, d21 - vmull.s16 q14, d24, d21 + vmull.s16 q8, d16, d21 + vmull.s16 q14, d18, d21 + vadd.s32 q13, q8, q14 + vsub.s32 q14, q8, q14 ; input[1] * cospi_24_64 - input[3] * cospi_8_64; ; input[1] * cospi_8_64 + input[3] * cospi_24_64; @@ -89,10 +88,10 @@ vmlal.s16 q1, d19, d22 ; dct_const_round_shift - vqrshrn.s32 d26, q13, #14 - vqrshrn.s32 d27, q14, #14 - vqrshrn.s32 d29, q15, #14 - vqrshrn.s32 d28, q1, #14 + vrshrn.s32 d26, q13, #14 + vrshrn.s32 d27, q14, #14 + vrshrn.s32 d29, q15, #14 + vrshrn.s32 d28, q1, #14 ; stage 2 ; output[0] = step[0] + step[3]; @@ -140,10 +139,10 @@ vmlal.s16 q1, d19, d22 ; dct_const_round_shift - vqrshrn.s32 d26, q13, #14 - vqrshrn.s32 d27, q14, #14 - vqrshrn.s32 d29, q15, #14 - vqrshrn.s32 d28, q1, #14 + vrshrn.s32 d26, q13, #14 + vrshrn.s32 d27, q14, #14 + vrshrn.s32 d29, q15, #14 + vrshrn.s32 d28, q1, #14 ; stage 2 ; output[0] = step[0] + step[3]; @@ -168,7 +167,7 @@ vld1.32 {d27[1]}, [r1], r2 vld1.32 {d27[0]}, [r1] ; no post-increment - ; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i] + ; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * stride + i] vaddw.u8 q8, q8, d26 vaddw.u8 q9, q9, d27 diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_add_neon.c index 8f669c90765..bff98cbc169 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_add_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_add_neon.c @@ -9,139 +9,56 @@ */ #include <arm_neon.h> +#include <assert.h> #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/arm/idct_neon.h" #include "vpx_dsp/txfm_common.h" void vpx_idct4x4_16_add_neon(const tran_low_t 
*input, uint8_t *dest, - int dest_stride) { - uint8x8_t d26u8, d27u8; - uint32x2_t d26u32, d27u32; - uint16x8_t q8u16, q9u16; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16; - int16x4_t d22s16, d23s16, d24s16, d26s16, d27s16, d28s16, d29s16; - int16x8_t q8s16, q9s16, q13s16, q14s16; - int32x4_t q1s32, q13s32, q14s32, q15s32; - int16x4x2_t d0x2s16, d1x2s16; - int32x4x2_t q0x2s32; - uint8_t *d; - - d26u32 = d27u32 = vdup_n_u32(0); - - q8s16 = load_tran_low_to_s16(input); - q9s16 = load_tran_low_to_s16(input + 8); - - d16s16 = vget_low_s16(q8s16); - d17s16 = vget_high_s16(q8s16); - d18s16 = vget_low_s16(q9s16); - d19s16 = vget_high_s16(q9s16); - - d0x2s16 = vtrn_s16(d16s16, d17s16); - d1x2s16 = vtrn_s16(d18s16, d19s16); - q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]); - q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]); - - d20s16 = vdup_n_s16((int16_t)cospi_8_64); - d21s16 = vdup_n_s16((int16_t)cospi_16_64); - - q0x2s32 = - vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q9s16)); - d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); - d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); - d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); - d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); - - d22s16 = vdup_n_s16((int16_t)cospi_24_64); - - // stage 1 - d23s16 = vadd_s16(d16s16, d18s16); - d24s16 = vsub_s16(d16s16, d18s16); - - q15s32 = vmull_s16(d17s16, d22s16); - q1s32 = vmull_s16(d17s16, d20s16); - q13s32 = vmull_s16(d23s16, d21s16); - q14s32 = vmull_s16(d24s16, d21s16); - - q15s32 = vmlsl_s16(q15s32, d19s16, d20s16); - q1s32 = vmlal_s16(q1s32, d19s16, d22s16); - - d26s16 = vqrshrn_n_s32(q13s32, 14); - d27s16 = vqrshrn_n_s32(q14s32, 14); - d29s16 = vqrshrn_n_s32(q15s32, 14); - d28s16 = vqrshrn_n_s32(q1s32, 14); - q13s16 = vcombine_s16(d26s16, d27s16); - q14s16 = vcombine_s16(d28s16, d29s16); - - // stage 2 - q8s16 = vaddq_s16(q13s16, q14s16); - q9s16 = vsubq_s16(q13s16, q14s16); - - d16s16 = vget_low_s16(q8s16); - d17s16 = vget_high_s16(q8s16); - d18s16 = vget_high_s16(q9s16); // vswp d18 d19 - d19s16 = vget_low_s16(q9s16); - - d0x2s16 = vtrn_s16(d16s16, d17s16); - d1x2s16 = vtrn_s16(d18s16, d19s16); - q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]); - q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]); - - q0x2s32 = - vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q9s16)); - d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); - d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); - d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); - d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); - - // do the transform on columns - // stage 1 - d23s16 = vadd_s16(d16s16, d18s16); - d24s16 = vsub_s16(d16s16, d18s16); - - q15s32 = vmull_s16(d17s16, d22s16); - q1s32 = vmull_s16(d17s16, d20s16); - q13s32 = vmull_s16(d23s16, d21s16); - q14s32 = vmull_s16(d24s16, d21s16); - - q15s32 = vmlsl_s16(q15s32, d19s16, d20s16); - q1s32 = vmlal_s16(q1s32, d19s16, d22s16); - - d26s16 = vqrshrn_n_s32(q13s32, 14); - d27s16 = vqrshrn_n_s32(q14s32, 14); - d29s16 = vqrshrn_n_s32(q15s32, 14); - d28s16 = vqrshrn_n_s32(q1s32, 14); - q13s16 = vcombine_s16(d26s16, d27s16); - q14s16 = vcombine_s16(d28s16, d29s16); - - // stage 2 - q8s16 = vaddq_s16(q13s16, q14s16); - q9s16 = vsubq_s16(q13s16, q14s16); - - q8s16 = vrshrq_n_s16(q8s16, 4); - q9s16 = vrshrq_n_s16(q9s16, 4); - - d = dest; - d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 0); - d += dest_stride; - d26u32 = 
vld1_lane_u32((const uint32_t *)d, d26u32, 1); - d += dest_stride; - d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 1); - d += dest_stride; - d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 0); - - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u32(d26u32)); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u32(d27u32)); - - d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - - d = dest; - vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 0); - d += dest_stride; - vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 1); - d += dest_stride; - vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 1); - d += dest_stride; - vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 0); + int stride) { + const uint8_t *dst = dest; + const int16x4_t cospis = vld1_s16(kCospi); + uint32x2_t dest01_u32 = vdup_n_u32(0); + uint32x2_t dest32_u32 = vdup_n_u32(0); + int16x8_t a0, a1; + uint8x8_t d01, d32; + uint16x8_t d01_u16, d32_u16; + + assert(!((intptr_t)dest % sizeof(uint32_t))); + assert(!(stride % sizeof(uint32_t))); + + // Rows + a0 = load_tran_low_to_s16q(input); + a1 = load_tran_low_to_s16q(input + 8); + idct4x4_16_kernel_bd8(cospis, &a0, &a1); + + // Columns + a1 = vcombine_s16(vget_high_s16(a1), vget_low_s16(a1)); + idct4x4_16_kernel_bd8(cospis, &a0, &a1); + a0 = vrshrq_n_s16(a0, 4); + a1 = vrshrq_n_s16(a1, 4); + + dest01_u32 = vld1_lane_u32((const uint32_t *)dst, dest01_u32, 0); + dst += stride; + dest01_u32 = vld1_lane_u32((const uint32_t *)dst, dest01_u32, 1); + dst += stride; + dest32_u32 = vld1_lane_u32((const uint32_t *)dst, dest32_u32, 1); + dst += stride; + dest32_u32 = vld1_lane_u32((const uint32_t *)dst, dest32_u32, 0); + + d01_u16 = + vaddw_u8(vreinterpretq_u16_s16(a0), vreinterpret_u8_u32(dest01_u32)); + d32_u16 = + vaddw_u8(vreinterpretq_u16_s16(a1), vreinterpret_u8_u32(dest32_u32)); + d01 = vqmovun_s16(vreinterpretq_s16_u16(d01_u16)); + d32 = vqmovun_s16(vreinterpretq_s16_u16(d32_u16)); + + vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d01), 0); + dest += stride; + vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d01), 1); + dest += stride; + vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d32), 1); + dest += stride; + vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d32), 0); } diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.asm index e4531c6e97f..29f678a0382 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.asm +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.asm @@ -15,12 +15,11 @@ AREA ||.text||, CODE, READONLY, ALIGN=2 -;void vpx_idct8x8_1_add_neon(int16_t *input, uint8_t *dest, -; int dest_stride) +;void vpx_idct8x8_1_add_neon(int16_t *input, uint8_t *dest, int stride) ; ; r0 int16_t input ; r1 uint8_t *dest -; r2 int dest_stride) +; r2 int stride) |vpx_idct8x8_1_add_neon| PROC ldrsh r0, [r0] diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c index eee41e6c6b1..7bcce913bdb 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c @@ -12,51 +12,53 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/inv_txfm.h" -#include "vpx_ports/mem.h" -void 
vpx_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest, - int dest_stride) { - uint8x8_t d2u8, d3u8, d30u8, d31u8; - uint64x1_t d2u64, d3u64, d4u64, d5u64; - uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16; - int16x8_t q0s16; - uint8_t *d1, *d2; - int16_t i, a1; - int16_t out = dct_const_round_shift(input[0] * cospi_16_64); - out = dct_const_round_shift(out * cospi_16_64); - a1 = ROUND_POWER_OF_TWO(out, 5); - - q0s16 = vdupq_n_s16(a1); - q0u16 = vreinterpretq_u16_s16(q0s16); +static INLINE uint8x8_t create_dcd(const int16_t dc) { + int16x8_t t = vdupq_n_s16(dc); + return vqmovun_s16(t); +} - d1 = d2 = dest; - for (i = 0; i < 2; i++) { - d2u64 = vld1_u64((const uint64_t *)d1); - d1 += dest_stride; - d3u64 = vld1_u64((const uint64_t *)d1); - d1 += dest_stride; - d4u64 = vld1_u64((const uint64_t *)d1); - d1 += dest_stride; - d5u64 = vld1_u64((const uint64_t *)d1); - d1 += dest_stride; +static INLINE void idct8x8_1_add_pos_kernel(uint8_t **dest, const int stride, + const uint8x8_t res) { + const uint8x8_t a = vld1_u8(*dest); + const uint8x8_t b = vqadd_u8(a, res); + vst1_u8(*dest, b); + *dest += stride; +} - q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64)); - q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64)); - q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64)); - q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64)); +static INLINE void idct8x8_1_add_neg_kernel(uint8_t **dest, const int stride, + const uint8x8_t res) { + const uint8x8_t a = vld1_u8(*dest); + const uint8x8_t b = vqsub_u8(a, res); + vst1_u8(*dest, b); + *dest += stride; +} - d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); - d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); - d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); +void vpx_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest, + int stride) { + const int16_t out0 = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64)); + const int16_t a1 = ROUND_POWER_OF_TWO(out1, 5); - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d31u8)); - d2 += dest_stride; + if (a1 >= 0) { + const uint8x8_t dc = create_dcd(a1); + idct8x8_1_add_pos_kernel(&dest, stride, dc); + idct8x8_1_add_pos_kernel(&dest, stride, dc); + idct8x8_1_add_pos_kernel(&dest, stride, dc); + idct8x8_1_add_pos_kernel(&dest, stride, dc); + idct8x8_1_add_pos_kernel(&dest, stride, dc); + idct8x8_1_add_pos_kernel(&dest, stride, dc); + idct8x8_1_add_pos_kernel(&dest, stride, dc); + idct8x8_1_add_pos_kernel(&dest, stride, dc); + } else { + const uint8x8_t dc = create_dcd(-a1); + idct8x8_1_add_neg_kernel(&dest, stride, dc); + idct8x8_1_add_neg_kernel(&dest, stride, dc); + idct8x8_1_add_neg_kernel(&dest, stride, dc); + idct8x8_1_add_neg_kernel(&dest, stride, dc); + idct8x8_1_add_neg_kernel(&dest, stride, dc); + idct8x8_1_add_neg_kernel(&dest, stride, dc); + idct8x8_1_add_neg_kernel(&dest, stride, dc); + idct8x8_1_add_neg_kernel(&dest, stride, dc); } } diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_add_neon.asm index a5c9c927d67..2bfbcc5a52c 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_add_neon.asm +++ 
b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_add_neon.asm @@ -47,12 +47,12 @@ vmlsl.s16 q6, d23, d3 ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d8, q2, #14 ; >> 14 - vqrshrn.s32 d9, q3, #14 ; >> 14 + vrshrn.s32 d8, q2, #14 ; >> 14 + vrshrn.s32 d9, q3, #14 ; >> 14 ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d10, q5, #14 ; >> 14 - vqrshrn.s32 d11, q6, #14 ; >> 14 + vrshrn.s32 d10, q5, #14 ; >> 14 + vrshrn.s32 d11, q6, #14 ; >> 14 ; input[1] * cospi_4_64 vmull.s16 q2, d18, d1 @@ -71,15 +71,15 @@ vmlal.s16 q13, d23, d2 ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d14, q2, #14 ; >> 14 - vqrshrn.s32 d15, q3, #14 ; >> 14 + vrshrn.s32 d14, q2, #14 ; >> 14 + vrshrn.s32 d15, q3, #14 ; >> 14 ; stage 2 & stage 3 - even half vdup.16 d0, r7 ; duplicate cospi_16_64 ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d12, q9, #14 ; >> 14 - vqrshrn.s32 d13, q13, #14 ; >> 14 + vrshrn.s32 d12, q9, #14 ; >> 14 + vrshrn.s32 d13, q13, #14 ; >> 14 ; input[0] * cospi_16_64 vmull.s16 q2, d16, d0 @@ -101,12 +101,12 @@ vdup.16 d1, r9 ; duplicate cospi_8_64 ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d18, q2, #14 ; >> 14 - vqrshrn.s32 d19, q3, #14 ; >> 14 + vrshrn.s32 d18, q2, #14 ; >> 14 + vrshrn.s32 d19, q3, #14 ; >> 14 ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d22, q13, #14 ; >> 14 - vqrshrn.s32 d23, q15, #14 ; >> 14 + vrshrn.s32 d22, q13, #14 ; >> 14 + vrshrn.s32 d23, q15, #14 ; >> 14 ; input[1] * cospi_24_64 - input[3] * cospi_8_64 ; input[1] * cospi_24_64 @@ -126,12 +126,12 @@ vmlal.s16 q12, d29, d0 ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d26, q2, #14 ; >> 14 - vqrshrn.s32 d27, q3, #14 ; >> 14 + vrshrn.s32 d26, q2, #14 ; >> 14 + vrshrn.s32 d27, q3, #14 ; >> 14 ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d30, q8, #14 ; >> 14 - vqrshrn.s32 d31, q12, #14 ; >> 14 + vrshrn.s32 d30, q8, #14 ; >> 14 + vrshrn.s32 d31, q12, #14 ; >> 14 vadd.s16 q0, q9, q15 ; output[0] = step[0] + step[3] vadd.s16 q1, q11, q13 ; output[1] = step[1] + step[2] @@ -164,12 +164,12 @@ vmlal.s16 q12, d27, d16 ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d10, q9, #14 ; >> 14 - vqrshrn.s32 d11, q10, #14 ; >> 14 + vrshrn.s32 d10, q9, #14 ; >> 14 + vrshrn.s32 d11, q10, #14 ; >> 14 ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d12, q11, #14 ; >> 14 - vqrshrn.s32 d13, q12, #14 ; >> 14 + vrshrn.s32 d12, q11, #14 ; >> 14 + vrshrn.s32 d13, q12, #14 ; >> 14 ; stage 4 vadd.s16 q8, q0, q7 ; output[0] = step1[0] + step1[7]; @@ -200,11 +200,11 @@ MEND AREA Block, CODE, READONLY ; name this block of code -;void vpx_idct8x8_64_add_neon(int16_t *input, uint8_t *dest, int dest_stride) +;void vpx_idct8x8_64_add_neon(int16_t *input, uint8_t *dest, int stride) ; ; r0 int16_t input ; r1 uint8_t *dest -; r2 int dest_stride) +; r2 int stride) |vpx_idct8x8_64_add_neon| PROC push {r4-r9} @@ -270,7 +270,7 @@ vld1.64 {d6}, [r1], r2 vld1.64 {d7}, [r1] - ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i] + ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * stride + i] vaddw.u8 q8, q8, d0 vaddw.u8 q9, q9, d1 vaddw.u8 q10, q10, d2 @@ -305,11 +305,11 @@ bx lr ENDP ; |vpx_idct8x8_64_add_neon| -;void vpx_idct8x8_12_add_neon(int16_t *input, uint8_t *dest, int dest_stride) +;void vpx_idct8x8_12_add_neon(int16_t *input, uint8_t *dest, int stride) ; ; r0 int16_t input ; r1 uint8_t *dest -; r2 int dest_stride) +; r2 int stride) |vpx_idct8x8_12_add_neon| PROC push {r4-r9} @@ -423,12 
+423,12 @@ vmlal.s16 q12, d27, d16 ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d10, q9, #14 ; >> 14 - vqrshrn.s32 d11, q10, #14 ; >> 14 + vrshrn.s32 d10, q9, #14 ; >> 14 + vrshrn.s32 d11, q10, #14 ; >> 14 ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d12, q11, #14 ; >> 14 - vqrshrn.s32 d13, q12, #14 ; >> 14 + vrshrn.s32 d12, q11, #14 ; >> 14 + vrshrn.s32 d13, q12, #14 ; >> 14 ; stage 4 vadd.s16 q8, q0, q7 ; output[0] = step1[0] + step1[7]; @@ -469,7 +469,7 @@ vld1.64 {d6}, [r1], r2 vld1.64 {d7}, [r1] - ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i] + ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * stride + i] vaddw.u8 q8, q8, d0 vaddw.u8 q9, q9, d1 vaddw.u8 q10, q10, d2 diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_add_neon.c index 159a6ec9891..279da67d74f 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_add_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_add_neon.c @@ -16,431 +16,111 @@ #include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/txfm_common.h" -static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16, - int16x8_t *q10s16, int16x8_t *q11s16, - int16x8_t *q12s16, int16x8_t *q13s16, - int16x8_t *q14s16, int16x8_t *q15s16) { - int16x4_t d0s16, d1s16, d2s16, d3s16; - int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; - int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; - int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32; - int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32; - - d0s16 = vdup_n_s16((int16_t)cospi_28_64); - d1s16 = vdup_n_s16((int16_t)cospi_4_64); - d2s16 = vdup_n_s16((int16_t)cospi_12_64); - d3s16 = vdup_n_s16((int16_t)cospi_20_64); - - d16s16 = vget_low_s16(*q8s16); - d17s16 = vget_high_s16(*q8s16); - d18s16 = vget_low_s16(*q9s16); - d19s16 = vget_high_s16(*q9s16); - d20s16 = vget_low_s16(*q10s16); - d21s16 = vget_high_s16(*q10s16); - d22s16 = vget_low_s16(*q11s16); - d23s16 = vget_high_s16(*q11s16); - d24s16 = vget_low_s16(*q12s16); - d25s16 = vget_high_s16(*q12s16); - d26s16 = vget_low_s16(*q13s16); - d27s16 = vget_high_s16(*q13s16); - d28s16 = vget_low_s16(*q14s16); - d29s16 = vget_high_s16(*q14s16); - d30s16 = vget_low_s16(*q15s16); - d31s16 = vget_high_s16(*q15s16); - - q2s32 = vmull_s16(d18s16, d0s16); - q3s32 = vmull_s16(d19s16, d0s16); - q5s32 = vmull_s16(d26s16, d2s16); - q6s32 = vmull_s16(d27s16, d2s16); - - q2s32 = vmlsl_s16(q2s32, d30s16, d1s16); - q3s32 = vmlsl_s16(q3s32, d31s16, d1s16); - q5s32 = vmlsl_s16(q5s32, d22s16, d3s16); - q6s32 = vmlsl_s16(q6s32, d23s16, d3s16); - - d8s16 = vqrshrn_n_s32(q2s32, 14); - d9s16 = vqrshrn_n_s32(q3s32, 14); - d10s16 = vqrshrn_n_s32(q5s32, 14); - d11s16 = vqrshrn_n_s32(q6s32, 14); - q4s16 = vcombine_s16(d8s16, d9s16); - q5s16 = vcombine_s16(d10s16, d11s16); - - q2s32 = vmull_s16(d18s16, d1s16); - q3s32 = vmull_s16(d19s16, d1s16); - q9s32 = vmull_s16(d26s16, d3s16); - q13s32 = vmull_s16(d27s16, d3s16); - - q2s32 = vmlal_s16(q2s32, d30s16, d0s16); - q3s32 = vmlal_s16(q3s32, d31s16, d0s16); - q9s32 = vmlal_s16(q9s32, d22s16, d2s16); - q13s32 = vmlal_s16(q13s32, d23s16, d2s16); - - d14s16 = vqrshrn_n_s32(q2s32, 14); - d15s16 = vqrshrn_n_s32(q3s32, 14); - d12s16 = vqrshrn_n_s32(q9s32, 14); - d13s16 = vqrshrn_n_s32(q13s32, 
14); - q6s16 = vcombine_s16(d12s16, d13s16); - q7s16 = vcombine_s16(d14s16, d15s16); - - d0s16 = vdup_n_s16((int16_t)cospi_16_64); - - q2s32 = vmull_s16(d16s16, d0s16); - q3s32 = vmull_s16(d17s16, d0s16); - q13s32 = vmull_s16(d16s16, d0s16); - q15s32 = vmull_s16(d17s16, d0s16); - - q2s32 = vmlal_s16(q2s32, d24s16, d0s16); - q3s32 = vmlal_s16(q3s32, d25s16, d0s16); - q13s32 = vmlsl_s16(q13s32, d24s16, d0s16); - q15s32 = vmlsl_s16(q15s32, d25s16, d0s16); - - d0s16 = vdup_n_s16((int16_t)cospi_24_64); - d1s16 = vdup_n_s16((int16_t)cospi_8_64); - - d18s16 = vqrshrn_n_s32(q2s32, 14); - d19s16 = vqrshrn_n_s32(q3s32, 14); - d22s16 = vqrshrn_n_s32(q13s32, 14); - d23s16 = vqrshrn_n_s32(q15s32, 14); - *q9s16 = vcombine_s16(d18s16, d19s16); - *q11s16 = vcombine_s16(d22s16, d23s16); - - q2s32 = vmull_s16(d20s16, d0s16); - q3s32 = vmull_s16(d21s16, d0s16); - q8s32 = vmull_s16(d20s16, d1s16); - q12s32 = vmull_s16(d21s16, d1s16); - - q2s32 = vmlsl_s16(q2s32, d28s16, d1s16); - q3s32 = vmlsl_s16(q3s32, d29s16, d1s16); - q8s32 = vmlal_s16(q8s32, d28s16, d0s16); - q12s32 = vmlal_s16(q12s32, d29s16, d0s16); - - d26s16 = vqrshrn_n_s32(q2s32, 14); - d27s16 = vqrshrn_n_s32(q3s32, 14); - d30s16 = vqrshrn_n_s32(q8s32, 14); - d31s16 = vqrshrn_n_s32(q12s32, 14); - *q13s16 = vcombine_s16(d26s16, d27s16); - *q15s16 = vcombine_s16(d30s16, d31s16); - - q0s16 = vaddq_s16(*q9s16, *q15s16); - q1s16 = vaddq_s16(*q11s16, *q13s16); - q2s16 = vsubq_s16(*q11s16, *q13s16); - q3s16 = vsubq_s16(*q9s16, *q15s16); - - *q13s16 = vsubq_s16(q4s16, q5s16); - q4s16 = vaddq_s16(q4s16, q5s16); - *q14s16 = vsubq_s16(q7s16, q6s16); - q7s16 = vaddq_s16(q7s16, q6s16); - d26s16 = vget_low_s16(*q13s16); - d27s16 = vget_high_s16(*q13s16); - d28s16 = vget_low_s16(*q14s16); - d29s16 = vget_high_s16(*q14s16); - - d16s16 = vdup_n_s16((int16_t)cospi_16_64); - - q9s32 = vmull_s16(d28s16, d16s16); - q10s32 = vmull_s16(d29s16, d16s16); - q11s32 = vmull_s16(d28s16, d16s16); - q12s32 = vmull_s16(d29s16, d16s16); - - q9s32 = vmlsl_s16(q9s32, d26s16, d16s16); - q10s32 = vmlsl_s16(q10s32, d27s16, d16s16); - q11s32 = vmlal_s16(q11s32, d26s16, d16s16); - q12s32 = vmlal_s16(q12s32, d27s16, d16s16); - - d10s16 = vqrshrn_n_s32(q9s32, 14); - d11s16 = vqrshrn_n_s32(q10s32, 14); - d12s16 = vqrshrn_n_s32(q11s32, 14); - d13s16 = vqrshrn_n_s32(q12s32, 14); - q5s16 = vcombine_s16(d10s16, d11s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - *q8s16 = vaddq_s16(q0s16, q7s16); - *q9s16 = vaddq_s16(q1s16, q6s16); - *q10s16 = vaddq_s16(q2s16, q5s16); - *q11s16 = vaddq_s16(q3s16, q4s16); - *q12s16 = vsubq_s16(q3s16, q4s16); - *q13s16 = vsubq_s16(q2s16, q5s16); - *q14s16 = vsubq_s16(q1s16, q6s16); - *q15s16 = vsubq_s16(q0s16, q7s16); +static INLINE void add8x8(int16x8_t a0, int16x8_t a1, int16x8_t a2, + int16x8_t a3, int16x8_t a4, int16x8_t a5, + int16x8_t a6, int16x8_t a7, uint8_t *dest, + const int stride) { + const uint8_t *dst = dest; + uint8x8_t d0, d1, d2, d3, d4, d5, d6, d7; + uint16x8_t d0_u16, d1_u16, d2_u16, d3_u16, d4_u16, d5_u16, d6_u16, d7_u16; + + a0 = vrshrq_n_s16(a0, 5); + a1 = vrshrq_n_s16(a1, 5); + a2 = vrshrq_n_s16(a2, 5); + a3 = vrshrq_n_s16(a3, 5); + a4 = vrshrq_n_s16(a4, 5); + a5 = vrshrq_n_s16(a5, 5); + a6 = vrshrq_n_s16(a6, 5); + a7 = vrshrq_n_s16(a7, 5); + + d0 = vld1_u8(dst); + dst += stride; + d1 = vld1_u8(dst); + dst += stride; + d2 = vld1_u8(dst); + dst += stride; + d3 = vld1_u8(dst); + dst += stride; + d4 = vld1_u8(dst); + dst += stride; + d5 = vld1_u8(dst); + dst += stride; + d6 = vld1_u8(dst); + dst += stride; + d7 = vld1_u8(dst); + + d0_u16 = 
vaddw_u8(vreinterpretq_u16_s16(a0), d0); + d1_u16 = vaddw_u8(vreinterpretq_u16_s16(a1), d1); + d2_u16 = vaddw_u8(vreinterpretq_u16_s16(a2), d2); + d3_u16 = vaddw_u8(vreinterpretq_u16_s16(a3), d3); + d4_u16 = vaddw_u8(vreinterpretq_u16_s16(a4), d4); + d5_u16 = vaddw_u8(vreinterpretq_u16_s16(a5), d5); + d6_u16 = vaddw_u8(vreinterpretq_u16_s16(a6), d6); + d7_u16 = vaddw_u8(vreinterpretq_u16_s16(a7), d7); + + d0 = vqmovun_s16(vreinterpretq_s16_u16(d0_u16)); + d1 = vqmovun_s16(vreinterpretq_s16_u16(d1_u16)); + d2 = vqmovun_s16(vreinterpretq_s16_u16(d2_u16)); + d3 = vqmovun_s16(vreinterpretq_s16_u16(d3_u16)); + d4 = vqmovun_s16(vreinterpretq_s16_u16(d4_u16)); + d5 = vqmovun_s16(vreinterpretq_s16_u16(d5_u16)); + d6 = vqmovun_s16(vreinterpretq_s16_u16(d6_u16)); + d7 = vqmovun_s16(vreinterpretq_s16_u16(d7_u16)); + + vst1_u8(dest, d0); + dest += stride; + vst1_u8(dest, d1); + dest += stride; + vst1_u8(dest, d2); + dest += stride; + vst1_u8(dest, d3); + dest += stride; + vst1_u8(dest, d4); + dest += stride; + vst1_u8(dest, d5); + dest += stride; + vst1_u8(dest, d6); + dest += stride; + vst1_u8(dest, d7); } void vpx_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, - int dest_stride) { - uint8_t *d1, *d2; - uint8x8_t d0u8, d1u8, d2u8, d3u8; - uint64x1_t d0u64, d1u64, d2u64, d3u64; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - uint16x8_t q8u16, q9u16, q10u16, q11u16; - - q8s16 = load_tran_low_to_s16(input); - q9s16 = load_tran_low_to_s16(input + 8); - q10s16 = load_tran_low_to_s16(input + 16); - q11s16 = load_tran_low_to_s16(input + 24); - q12s16 = load_tran_low_to_s16(input + 32); - q13s16 = load_tran_low_to_s16(input + 40); - q14s16 = load_tran_low_to_s16(input + 48); - q15s16 = load_tran_low_to_s16(input + 56); - - transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - q8s16 = vrshrq_n_s16(q8s16, 5); - q9s16 = vrshrq_n_s16(q9s16, 5); - q10s16 = vrshrq_n_s16(q10s16, 5); - q11s16 = vrshrq_n_s16(q11s16, 5); - q12s16 = vrshrq_n_s16(q12s16, 5); - q13s16 = vrshrq_n_s16(q13s16, 5); - q14s16 = vrshrq_n_s16(q14s16, 5); - q15s16 = vrshrq_n_s16(q15s16, 5); - - d1 = d2 = dest; - - d0u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d1u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d2u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d3u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64)); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64)); - q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64)); - q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64)); - - d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); - d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); - - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); - d2 += dest_stride; - - q8s16 = q12s16; - q9s16 = q13s16; - 
q10s16 = q14s16; - q11s16 = q15s16; - - d0u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d1u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d2u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d3u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64)); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64)); - q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64)); - q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64)); - - d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); - d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); - - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); - d2 += dest_stride; + int stride) { + const int16x8_t cospis = vld1q_s16(kCospi); + const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24 + const int16x4_t cospis1 = vget_high_s16(cospis); // cospi 4, 12, 20, 28 + int16x8_t a0 = load_tran_low_to_s16q(input); + int16x8_t a1 = load_tran_low_to_s16q(input + 8); + int16x8_t a2 = load_tran_low_to_s16q(input + 16); + int16x8_t a3 = load_tran_low_to_s16q(input + 24); + int16x8_t a4 = load_tran_low_to_s16q(input + 32); + int16x8_t a5 = load_tran_low_to_s16q(input + 40); + int16x8_t a6 = load_tran_low_to_s16q(input + 48); + int16x8_t a7 = load_tran_low_to_s16q(input + 56); + + idct8x8_64_1d_bd8(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); + idct8x8_64_1d_bd8(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); + add8x8(a0, a1, a2, a3, a4, a5, a6, a7, dest, stride); } void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest, - int dest_stride) { - uint8_t *d1, *d2; - uint8x8_t d0u8, d1u8, d2u8, d3u8; - int16x4_t d10s16, d11s16, d12s16, d13s16, d16s16; - int16x4_t d26s16, d27s16, d28s16, d29s16; - uint64x1_t d0u64, d1u64, d2u64, d3u64; - int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - uint16x8_t q8u16, q9u16, q10u16, q11u16; - int32x4_t q9s32, q10s32, q11s32, q12s32; - - q8s16 = load_tran_low_to_s16(input); - q9s16 = load_tran_low_to_s16(input + 8); - q10s16 = load_tran_low_to_s16(input + 16); - q11s16 = load_tran_low_to_s16(input + 24); - q12s16 = load_tran_low_to_s16(input + 32); - q13s16 = load_tran_low_to_s16(input + 40); - q14s16 = load_tran_low_to_s16(input + 48); - q15s16 = load_tran_low_to_s16(input + 56); - - transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - // First transform rows - // stage 1 - q0s16 = vdupq_n_s16((int16_t)cospi_28_64 * 2); - q1s16 = vdupq_n_s16((int16_t)cospi_4_64 * 2); - - q4s16 = vqrdmulhq_s16(q9s16, q0s16); - - q0s16 = vdupq_n_s16(-(int16_t)cospi_20_64 * 2); - - q7s16 = vqrdmulhq_s16(q9s16, q1s16); - - q1s16 = vdupq_n_s16((int16_t)cospi_12_64 * 2); - - q5s16 = vqrdmulhq_s16(q11s16, q0s16); - - q0s16 = vdupq_n_s16((int16_t)cospi_16_64 * 2); - - q6s16 = vqrdmulhq_s16(q11s16, q1s16); - - // stage 2 & stage 3 - even half - q1s16 = vdupq_n_s16((int16_t)cospi_24_64 * 2); - - q9s16 = vqrdmulhq_s16(q8s16, q0s16); - - q0s16 = vdupq_n_s16((int16_t)cospi_8_64 * 2); - - q13s16 = 
vqrdmulhq_s16(q10s16, q1s16); - - q15s16 = vqrdmulhq_s16(q10s16, q0s16); - - // stage 3 -odd half - q0s16 = vaddq_s16(q9s16, q15s16); - q1s16 = vaddq_s16(q9s16, q13s16); - q2s16 = vsubq_s16(q9s16, q13s16); - q3s16 = vsubq_s16(q9s16, q15s16); - - // stage 2 - odd half - q13s16 = vsubq_s16(q4s16, q5s16); - q4s16 = vaddq_s16(q4s16, q5s16); - q14s16 = vsubq_s16(q7s16, q6s16); - q7s16 = vaddq_s16(q7s16, q6s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - d28s16 = vget_low_s16(q14s16); - d29s16 = vget_high_s16(q14s16); - - d16s16 = vdup_n_s16((int16_t)cospi_16_64); - q9s32 = vmull_s16(d28s16, d16s16); - q10s32 = vmull_s16(d29s16, d16s16); - q11s32 = vmull_s16(d28s16, d16s16); - q12s32 = vmull_s16(d29s16, d16s16); - - q9s32 = vmlsl_s16(q9s32, d26s16, d16s16); - q10s32 = vmlsl_s16(q10s32, d27s16, d16s16); - q11s32 = vmlal_s16(q11s32, d26s16, d16s16); - q12s32 = vmlal_s16(q12s32, d27s16, d16s16); - - d10s16 = vqrshrn_n_s32(q9s32, 14); - d11s16 = vqrshrn_n_s32(q10s32, 14); - d12s16 = vqrshrn_n_s32(q11s32, 14); - d13s16 = vqrshrn_n_s32(q12s32, 14); - q5s16 = vcombine_s16(d10s16, d11s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - // stage 4 - q8s16 = vaddq_s16(q0s16, q7s16); - q9s16 = vaddq_s16(q1s16, q6s16); - q10s16 = vaddq_s16(q2s16, q5s16); - q11s16 = vaddq_s16(q3s16, q4s16); - q12s16 = vsubq_s16(q3s16, q4s16); - q13s16 = vsubq_s16(q2s16, q5s16); - q14s16 = vsubq_s16(q1s16, q6s16); - q15s16 = vsubq_s16(q0s16, q7s16); - - transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - q8s16 = vrshrq_n_s16(q8s16, 5); - q9s16 = vrshrq_n_s16(q9s16, 5); - q10s16 = vrshrq_n_s16(q10s16, 5); - q11s16 = vrshrq_n_s16(q11s16, 5); - q12s16 = vrshrq_n_s16(q12s16, 5); - q13s16 = vrshrq_n_s16(q13s16, 5); - q14s16 = vrshrq_n_s16(q14s16, 5); - q15s16 = vrshrq_n_s16(q15s16, 5); - - d1 = d2 = dest; - - d0u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d1u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d2u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d3u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64)); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64)); - q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64)); - q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64)); - - d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); - d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); - - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); - d2 += dest_stride; - - q8s16 = q12s16; - q9s16 = q13s16; - q10s16 = q14s16; - q11s16 = q15s16; - - d0u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d1u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d2u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d3u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64)); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64)); - q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), 
vreinterpret_u8_u64(d2u64)); - q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64)); - - d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); - d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); - - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); - d2 += dest_stride; + int stride) { + const int16x8_t cospis = vld1q_s16(kCospi); + const int16x8_t cospisd = vaddq_s16(cospis, cospis); + const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24 + const int16x4_t cospisd0 = vget_low_s16(cospisd); // doubled 0, 8, 16, 24 + const int16x4_t cospisd1 = vget_high_s16(cospisd); // doubled 4, 12, 20, 28 + int16x4_t a0, a1, a2, a3, a4, a5, a6, a7; + int16x8_t b0, b1, b2, b3, b4, b5, b6, b7; + + a0 = load_tran_low_to_s16d(input); + a1 = load_tran_low_to_s16d(input + 8); + a2 = load_tran_low_to_s16d(input + 16); + a3 = load_tran_low_to_s16d(input + 24); + + idct8x8_12_pass1_bd8(cospis0, cospisd0, cospisd1, &a0, &a1, &a2, &a3, &a4, + &a5, &a6, &a7); + idct8x8_12_pass2_bd8(cospis0, cospisd0, cospisd1, a0, a1, a2, a3, a4, a5, a6, + a7, &b0, &b1, &b2, &b3, &b4, &b5, &b6, &b7); + add8x8(b0, b1, b2, b3, b4, b5, b6, b7, dest, stride); } diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.asm index f39e8ddd4b4..5dd9bdc7888 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.asm +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.asm @@ -10,8 +10,9 @@ INCLUDE ./vpx_config.asm - ; Helper function used to load tran_low_t into int16, narrowing if + ; Helper functions used to load tran_low_t into int16, narrowing if ; necessary. + ; $dst0..3 are d registers with the pairs assumed to be contiguous in ; non-high-bitdepth builds. q0-q3 are used as temporaries in high-bitdepth. MACRO @@ -27,4 +28,19 @@ vld1.s16 {$dst0-$dst1,$dst2-$dst3}, [$src]! ENDIF MEND + + ; $dst0..3 are d registers. q0-q3 are used as temporaries in high-bitdepth. + MACRO + LOAD_TRAN_LOW_TO_S16X2 $dst0, $dst1, $dst2, $dst3, $src + IF CONFIG_VP9_HIGHBITDEPTH + vld2.s32 {q0,q1}, [$src]! + vld2.s32 {q2,q3}, [$src]! + vmovn.i32 $dst0, q0 + vmovn.i32 $dst1, q2 + vmovn.i32 $dst2, q1 + vmovn.i32 $dst3, q3 + ELSE + vld2.s16 {$dst0,$dst1,$dst2,$dst3}, [$src]! 
+ ENDIF + MEND END diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.h b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.h index 5c2a53c034f..d9b85223c76 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.h +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.h @@ -17,10 +17,45 @@ #include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/vpx_dsp_common.h" +DECLARE_ALIGNED(16, static const int16_t, kCospi[16]) = { + 16384 /* cospi_0_64 */, 15137 /* cospi_8_64 */, + 11585 /* cospi_16_64 */, 6270 /* cospi_24_64 */, + 16069 /* cospi_4_64 */, 13623 /* cospi_12_64 */, + -9102 /* -cospi_20_64 */, 3196 /* cospi_28_64 */, + 16305 /* cospi_2_64 */, 1606 /* cospi_30_64 */, + 14449 /* cospi_10_64 */, 7723 /* cospi_22_64 */, + 15679 /* cospi_6_64 */, -4756 /* -cospi_26_64 */, + 12665 /* cospi_14_64 */, -10394 /* -cospi_18_64 */ +}; + +DECLARE_ALIGNED(16, static const int32_t, kCospi32[8]) = { + 16384 /* cospi_0_64 */, 15137 /* cospi_8_64 */, + 11585 /* cospi_16_64 */, 6270 /* cospi_24_64 */, + 16069 /* cospi_4_64 */, 13623 /* cospi_12_64 */, + -9102 /* -cospi_20_64 */, 3196 /* cospi_28_64 */ +}; + //------------------------------------------------------------------------------ +// Helper functions used to load tran_low_t into int16, narrowing if necessary. -// Helper function used to load tran_low_t into int16, narrowing if necessary. -static INLINE int16x8_t load_tran_low_to_s16(const tran_low_t *buf) { +static INLINE int16x8x2_t load_tran_low_to_s16x2q(const tran_low_t *buf) { +#if CONFIG_VP9_HIGHBITDEPTH + const int32x4x2_t v0 = vld2q_s32(buf); + const int32x4x2_t v1 = vld2q_s32(buf + 8); + const int16x4_t s0 = vmovn_s32(v0.val[0]); + const int16x4_t s1 = vmovn_s32(v0.val[1]); + const int16x4_t s2 = vmovn_s32(v1.val[0]); + const int16x4_t s3 = vmovn_s32(v1.val[1]); + int16x8x2_t res; + res.val[0] = vcombine_s16(s0, s2); + res.val[1] = vcombine_s16(s1, s3); + return res; +#else + return vld2q_s16(buf); +#endif +} + +static INLINE int16x8_t load_tran_low_to_s16q(const tran_low_t *buf) { #if CONFIG_VP9_HIGHBITDEPTH const int32x4_t v0 = vld1q_s32(buf); const int32x4_t v1 = vld1q_s32(buf + 4); @@ -32,6 +67,17 @@ static INLINE int16x8_t load_tran_low_to_s16(const tran_low_t *buf) { #endif } +static INLINE int16x4_t load_tran_low_to_s16d(const tran_low_t *buf) { +#if CONFIG_VP9_HIGHBITDEPTH + const int32x4_t v0 = vld1q_s32(buf); + return vmovn_s32(v0); +#else + return vld1_s16(buf); +#endif +} + +//------------------------------------------------------------------------------ + // Multiply a by a_const. Saturate, shift and narrow by 14. static INLINE int16x8_t multiply_shift_and_narrow_s16(const int16x8_t a, const int16_t a_const) { @@ -85,30 +131,6 @@ static INLINE int16x8_t multiply_accumulate_shift_and_narrow_s16( return vcombine_s16(vrshrn_n_s32(temp_low, 14), vrshrn_n_s32(temp_high, 14)); } -static INLINE void load_and_transpose_s16_8x8(const int16_t *a, int a_stride, - int16x8_t *a0, int16x8_t *a1, - int16x8_t *a2, int16x8_t *a3, - int16x8_t *a4, int16x8_t *a5, - int16x8_t *a6, int16x8_t *a7) { - *a0 = vld1q_s16(a); - a += a_stride; - *a1 = vld1q_s16(a); - a += a_stride; - *a2 = vld1q_s16(a); - a += a_stride; - *a3 = vld1q_s16(a); - a += a_stride; - *a4 = vld1q_s16(a); - a += a_stride; - *a5 = vld1q_s16(a); - a += a_stride; - *a6 = vld1q_s16(a); - a += a_stride; - *a7 = vld1q_s16(a); - - transpose_s16_8x8(a0, a1, a2, a3, a4, a5, a6, a7); -} - // Shift the output down by 6 and add it to the destination buffer. 
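// Illustrative scalar model (not part of libvpx) of the NEON helper
// add_and_store_u8_s16() that follows: each 16-bit residual is rounded,
// shifted down by 6 and added to the destination pixel, saturating to
// [0, 255]. The name, the fixed 8x8 shape and the inline clamp are
// assumptions made for this sketch.
static INLINE void add_and_store_u8_s16_sketch(const int16_t *a, uint8_t *b,
                                               int b_stride) {
  int r, c;
  for (r = 0; r < 8; ++r) {
    for (c = 0; c < 8; ++c) {
      const int v = b[c] + ((a[c] + 32) >> 6);  // rounding shift, then add
      b[c] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));  // saturate to 8 bits
    }
    a += 8;
    b += b_stride;
  }
}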
static INLINE void add_and_store_u8_s16(const int16x8_t a0, const int16x8_t a1, const int16x8_t a2, const int16x8_t a3, @@ -169,4 +191,354 @@ static INLINE void add_and_store_u8_s16(const int16x8_t a0, const int16x8_t a1, b += b_stride; vst1_u8(b, b7); } + +static INLINE uint8x16_t create_dcq(const int16_t dc) { + // Clip both sides and gcc may compile to assembly 'usat'. + const int16_t t = (dc < 0) ? 0 : ((dc > 255) ? 255 : dc); + return vdupq_n_u8((uint8_t)t); +} + +static INLINE void idct4x4_16_kernel_bd8(const int16x4_t cospis, + int16x8_t *const a0, + int16x8_t *const a1) { + int16x4_t b0, b1, b2, b3; + int32x4_t c0, c1, c2, c3; + int16x8_t d0, d1; + + transpose_s16_4x4q(a0, a1); + b0 = vget_low_s16(*a0); + b1 = vget_high_s16(*a0); + b2 = vget_low_s16(*a1); + b3 = vget_high_s16(*a1); + c0 = vmull_lane_s16(b0, cospis, 2); + c2 = vmull_lane_s16(b1, cospis, 2); + c1 = vsubq_s32(c0, c2); + c0 = vaddq_s32(c0, c2); + c2 = vmull_lane_s16(b2, cospis, 3); + c3 = vmull_lane_s16(b2, cospis, 1); + c2 = vmlsl_lane_s16(c2, b3, cospis, 1); + c3 = vmlal_lane_s16(c3, b3, cospis, 3); + b0 = vrshrn_n_s32(c0, 14); + b1 = vrshrn_n_s32(c1, 14); + b2 = vrshrn_n_s32(c2, 14); + b3 = vrshrn_n_s32(c3, 14); + d0 = vcombine_s16(b0, b1); + d1 = vcombine_s16(b3, b2); + *a0 = vaddq_s16(d0, d1); + *a1 = vsubq_s16(d0, d1); +} + +static INLINE void idct8x8_12_pass1_bd8( + const int16x4_t cospis0, const int16x4_t cospisd0, const int16x4_t cospisd1, + int16x4_t *const io0, int16x4_t *const io1, int16x4_t *const io2, + int16x4_t *const io3, int16x4_t *const io4, int16x4_t *const io5, + int16x4_t *const io6, int16x4_t *const io7) { + int16x4_t step1[8], step2[8]; + int32x4_t t32[2]; + + transpose_s16_4x4d(io0, io1, io2, io3); + + // stage 1 + step1[4] = vqrdmulh_lane_s16(*io1, cospisd1, 3); + step1[5] = vqrdmulh_lane_s16(*io3, cospisd1, 2); + step1[6] = vqrdmulh_lane_s16(*io3, cospisd1, 1); + step1[7] = vqrdmulh_lane_s16(*io1, cospisd1, 0); + + // stage 2 + step2[1] = vqrdmulh_lane_s16(*io0, cospisd0, 2); + step2[2] = vqrdmulh_lane_s16(*io2, cospisd0, 3); + step2[3] = vqrdmulh_lane_s16(*io2, cospisd0, 1); + + step2[4] = vadd_s16(step1[4], step1[5]); + step2[5] = vsub_s16(step1[4], step1[5]); + step2[6] = vsub_s16(step1[7], step1[6]); + step2[7] = vadd_s16(step1[7], step1[6]); + + // stage 3 + step1[0] = vadd_s16(step2[1], step2[3]); + step1[1] = vadd_s16(step2[1], step2[2]); + step1[2] = vsub_s16(step2[1], step2[2]); + step1[3] = vsub_s16(step2[1], step2[3]); + + t32[1] = vmull_lane_s16(step2[6], cospis0, 2); + t32[0] = vmlsl_lane_s16(t32[1], step2[5], cospis0, 2); + t32[1] = vmlal_lane_s16(t32[1], step2[5], cospis0, 2); + step1[5] = vrshrn_n_s32(t32[0], 14); + step1[6] = vrshrn_n_s32(t32[1], 14); + + // stage 4 + *io0 = vadd_s16(step1[0], step2[7]); + *io1 = vadd_s16(step1[1], step1[6]); + *io2 = vadd_s16(step1[2], step1[5]); + *io3 = vadd_s16(step1[3], step2[4]); + *io4 = vsub_s16(step1[3], step2[4]); + *io5 = vsub_s16(step1[2], step1[5]); + *io6 = vsub_s16(step1[1], step1[6]); + *io7 = vsub_s16(step1[0], step2[7]); +} + +static INLINE void idct8x8_12_pass2_bd8( + const int16x4_t cospis0, const int16x4_t cospisd0, const int16x4_t cospisd1, + const int16x4_t input0, const int16x4_t input1, const int16x4_t input2, + const int16x4_t input3, const int16x4_t input4, const int16x4_t input5, + const int16x4_t input6, const int16x4_t input7, int16x8_t *const output0, + int16x8_t *const output1, int16x8_t *const output2, + int16x8_t *const output3, int16x8_t *const output4, + int16x8_t *const output5, int16x8_t *const output6, + 
int16x8_t *const output7) { + int16x8_t in[4]; + int16x8_t step1[8], step2[8]; + int32x4_t t32[8]; + int16x4_t t16[8]; + + transpose_s16_4x8(input0, input1, input2, input3, input4, input5, input6, + input7, &in[0], &in[1], &in[2], &in[3]); + + // stage 1 + step1[4] = vqrdmulhq_lane_s16(in[1], cospisd1, 3); + step1[5] = vqrdmulhq_lane_s16(in[3], cospisd1, 2); + step1[6] = vqrdmulhq_lane_s16(in[3], cospisd1, 1); + step1[7] = vqrdmulhq_lane_s16(in[1], cospisd1, 0); + + // stage 2 + step2[1] = vqrdmulhq_lane_s16(in[0], cospisd0, 2); + step2[2] = vqrdmulhq_lane_s16(in[2], cospisd0, 3); + step2[3] = vqrdmulhq_lane_s16(in[2], cospisd0, 1); + + step2[4] = vaddq_s16(step1[4], step1[5]); + step2[5] = vsubq_s16(step1[4], step1[5]); + step2[6] = vsubq_s16(step1[7], step1[6]); + step2[7] = vaddq_s16(step1[7], step1[6]); + + // stage 3 + step1[0] = vaddq_s16(step2[1], step2[3]); + step1[1] = vaddq_s16(step2[1], step2[2]); + step1[2] = vsubq_s16(step2[1], step2[2]); + step1[3] = vsubq_s16(step2[1], step2[3]); + + t32[2] = vmull_lane_s16(vget_low_s16(step2[6]), cospis0, 2); + t32[3] = vmull_lane_s16(vget_high_s16(step2[6]), cospis0, 2); + t32[0] = vmlsl_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2); + t32[1] = vmlsl_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2); + t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2); + t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2); + t16[0] = vrshrn_n_s32(t32[0], 14); + t16[1] = vrshrn_n_s32(t32[1], 14); + t16[2] = vrshrn_n_s32(t32[2], 14); + t16[3] = vrshrn_n_s32(t32[3], 14); + step1[5] = vcombine_s16(t16[0], t16[1]); + step1[6] = vcombine_s16(t16[2], t16[3]); + + // stage 4 + *output0 = vaddq_s16(step1[0], step2[7]); + *output1 = vaddq_s16(step1[1], step1[6]); + *output2 = vaddq_s16(step1[2], step1[5]); + *output3 = vaddq_s16(step1[3], step2[4]); + *output4 = vsubq_s16(step1[3], step2[4]); + *output5 = vsubq_s16(step1[2], step1[5]); + *output6 = vsubq_s16(step1[1], step1[6]); + *output7 = vsubq_s16(step1[0], step2[7]); +} + +static INLINE void idct8x8_64_1d_bd8(const int16x4_t cospis0, + const int16x4_t cospis1, + int16x8_t *const io0, int16x8_t *const io1, + int16x8_t *const io2, int16x8_t *const io3, + int16x8_t *const io4, int16x8_t *const io5, + int16x8_t *const io6, + int16x8_t *const io7) { + int16x4_t input_1l, input_1h, input_3l, input_3h, input_5l, input_5h, + input_7l, input_7h; + int16x4_t step1l[4], step1h[4]; + int16x8_t step1[8], step2[8]; + int32x4_t t32[8]; + int16x4_t t16[8]; + + transpose_s16_8x8(io0, io1, io2, io3, io4, io5, io6, io7); + + // stage 1 + input_1l = vget_low_s16(*io1); + input_1h = vget_high_s16(*io1); + input_3l = vget_low_s16(*io3); + input_3h = vget_high_s16(*io3); + input_5l = vget_low_s16(*io5); + input_5h = vget_high_s16(*io5); + input_7l = vget_low_s16(*io7); + input_7h = vget_high_s16(*io7); + step1l[0] = vget_low_s16(*io0); + step1h[0] = vget_high_s16(*io0); + step1l[1] = vget_low_s16(*io2); + step1h[1] = vget_high_s16(*io2); + step1l[2] = vget_low_s16(*io4); + step1h[2] = vget_high_s16(*io4); + step1l[3] = vget_low_s16(*io6); + step1h[3] = vget_high_s16(*io6); + + t32[0] = vmull_lane_s16(input_1l, cospis1, 3); + t32[1] = vmull_lane_s16(input_1h, cospis1, 3); + t32[2] = vmull_lane_s16(input_3l, cospis1, 2); + t32[3] = vmull_lane_s16(input_3h, cospis1, 2); + t32[4] = vmull_lane_s16(input_3l, cospis1, 1); + t32[5] = vmull_lane_s16(input_3h, cospis1, 1); + t32[6] = vmull_lane_s16(input_1l, cospis1, 0); + t32[7] = vmull_lane_s16(input_1h, cospis1, 0); + t32[0] = 
vmlsl_lane_s16(t32[0], input_7l, cospis1, 0); + t32[1] = vmlsl_lane_s16(t32[1], input_7h, cospis1, 0); + t32[2] = vmlal_lane_s16(t32[2], input_5l, cospis1, 1); + t32[3] = vmlal_lane_s16(t32[3], input_5h, cospis1, 1); + t32[4] = vmlsl_lane_s16(t32[4], input_5l, cospis1, 2); + t32[5] = vmlsl_lane_s16(t32[5], input_5h, cospis1, 2); + t32[6] = vmlal_lane_s16(t32[6], input_7l, cospis1, 3); + t32[7] = vmlal_lane_s16(t32[7], input_7h, cospis1, 3); + t16[0] = vrshrn_n_s32(t32[0], 14); + t16[1] = vrshrn_n_s32(t32[1], 14); + t16[2] = vrshrn_n_s32(t32[2], 14); + t16[3] = vrshrn_n_s32(t32[3], 14); + t16[4] = vrshrn_n_s32(t32[4], 14); + t16[5] = vrshrn_n_s32(t32[5], 14); + t16[6] = vrshrn_n_s32(t32[6], 14); + t16[7] = vrshrn_n_s32(t32[7], 14); + step1[4] = vcombine_s16(t16[0], t16[1]); + step1[5] = vcombine_s16(t16[2], t16[3]); + step1[6] = vcombine_s16(t16[4], t16[5]); + step1[7] = vcombine_s16(t16[6], t16[7]); + + // stage 2 + t32[2] = vmull_lane_s16(step1l[0], cospis0, 2); + t32[3] = vmull_lane_s16(step1h[0], cospis0, 2); + t32[4] = vmull_lane_s16(step1l[1], cospis0, 3); + t32[5] = vmull_lane_s16(step1h[1], cospis0, 3); + t32[6] = vmull_lane_s16(step1l[1], cospis0, 1); + t32[7] = vmull_lane_s16(step1h[1], cospis0, 1); + t32[0] = vmlal_lane_s16(t32[2], step1l[2], cospis0, 2); + t32[1] = vmlal_lane_s16(t32[3], step1h[2], cospis0, 2); + t32[2] = vmlsl_lane_s16(t32[2], step1l[2], cospis0, 2); + t32[3] = vmlsl_lane_s16(t32[3], step1h[2], cospis0, 2); + t32[4] = vmlsl_lane_s16(t32[4], step1l[3], cospis0, 1); + t32[5] = vmlsl_lane_s16(t32[5], step1h[3], cospis0, 1); + t32[6] = vmlal_lane_s16(t32[6], step1l[3], cospis0, 3); + t32[7] = vmlal_lane_s16(t32[7], step1h[3], cospis0, 3); + t16[0] = vrshrn_n_s32(t32[0], 14); + t16[1] = vrshrn_n_s32(t32[1], 14); + t16[2] = vrshrn_n_s32(t32[2], 14); + t16[3] = vrshrn_n_s32(t32[3], 14); + t16[4] = vrshrn_n_s32(t32[4], 14); + t16[5] = vrshrn_n_s32(t32[5], 14); + t16[6] = vrshrn_n_s32(t32[6], 14); + t16[7] = vrshrn_n_s32(t32[7], 14); + step2[0] = vcombine_s16(t16[0], t16[1]); + step2[1] = vcombine_s16(t16[2], t16[3]); + step2[2] = vcombine_s16(t16[4], t16[5]); + step2[3] = vcombine_s16(t16[6], t16[7]); + + step2[4] = vaddq_s16(step1[4], step1[5]); + step2[5] = vsubq_s16(step1[4], step1[5]); + step2[6] = vsubq_s16(step1[7], step1[6]); + step2[7] = vaddq_s16(step1[7], step1[6]); + + // stage 3 + step1[0] = vaddq_s16(step2[0], step2[3]); + step1[1] = vaddq_s16(step2[1], step2[2]); + step1[2] = vsubq_s16(step2[1], step2[2]); + step1[3] = vsubq_s16(step2[0], step2[3]); + + t32[2] = vmull_lane_s16(vget_low_s16(step2[6]), cospis0, 2); + t32[3] = vmull_lane_s16(vget_high_s16(step2[6]), cospis0, 2); + t32[0] = vmlsl_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2); + t32[1] = vmlsl_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2); + t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2); + t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2); + t16[0] = vrshrn_n_s32(t32[0], 14); + t16[1] = vrshrn_n_s32(t32[1], 14); + t16[2] = vrshrn_n_s32(t32[2], 14); + t16[3] = vrshrn_n_s32(t32[3], 14); + step1[5] = vcombine_s16(t16[0], t16[1]); + step1[6] = vcombine_s16(t16[2], t16[3]); + + // stage 4 + *io0 = vaddq_s16(step1[0], step2[7]); + *io1 = vaddq_s16(step1[1], step1[6]); + *io2 = vaddq_s16(step1[2], step1[5]); + *io3 = vaddq_s16(step1[3], step2[4]); + *io4 = vsubq_s16(step1[3], step2[4]); + *io5 = vsubq_s16(step1[2], step1[5]); + *io6 = vsubq_s16(step1[1], step1[6]); + *io7 = vsubq_s16(step1[0], step2[7]); +} + +static INLINE void 
idct16x16_add_wrap_low_8x2(const int32x4_t *const t32, + int16x8_t *const d0, + int16x8_t *const d1) { + int16x4_t t16[4]; + + t16[0] = vrshrn_n_s32(t32[0], 14); + t16[1] = vrshrn_n_s32(t32[1], 14); + t16[2] = vrshrn_n_s32(t32[2], 14); + t16[3] = vrshrn_n_s32(t32[3], 14); + *d0 = vcombine_s16(t16[0], t16[1]); + *d1 = vcombine_s16(t16[2], t16[3]); +} + +static INLINE void idct_cospi_8_24_q_kernel(const int16x8_t s0, + const int16x8_t s1, + const int16x4_t cospi_0_8_16_24, + int32x4_t *const t32) { + t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_0_8_16_24, 3); + t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_0_8_16_24, 3); + t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_0_8_16_24, 3); + t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_0_8_16_24, 3); + t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_0_8_16_24, 1); + t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_0_8_16_24, 1); + t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_0_8_16_24, 1); + t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_0_8_16_24, 1); +} + +static INLINE void idct_cospi_8_24_q(const int16x8_t s0, const int16x8_t s1, + const int16x4_t cospi_0_8_16_24, + int16x8_t *const d0, int16x8_t *const d1) { + int32x4_t t32[4]; + + idct_cospi_8_24_q_kernel(s0, s1, cospi_0_8_16_24, t32); + idct16x16_add_wrap_low_8x2(t32, d0, d1); +} + +static INLINE void idct_cospi_8_24_neg_q(const int16x8_t s0, const int16x8_t s1, + const int16x4_t cospi_0_8_16_24, + int16x8_t *const d0, + int16x8_t *const d1) { + int32x4_t t32[4]; + + idct_cospi_8_24_q_kernel(s0, s1, cospi_0_8_16_24, t32); + t32[2] = vnegq_s32(t32[2]); + t32[3] = vnegq_s32(t32[3]); + idct16x16_add_wrap_low_8x2(t32, d0, d1); +} + +static INLINE void idct_cospi_16_16_q(const int16x8_t s0, const int16x8_t s1, + const int16x4_t cospi_0_8_16_24, + int16x8_t *const d0, + int16x8_t *const d1) { + int32x4_t t32[6]; + + t32[4] = vmull_lane_s16(vget_low_s16(s1), cospi_0_8_16_24, 2); + t32[5] = vmull_lane_s16(vget_high_s16(s1), cospi_0_8_16_24, 2); + t32[0] = vmlsl_lane_s16(t32[4], vget_low_s16(s0), cospi_0_8_16_24, 2); + t32[1] = vmlsl_lane_s16(t32[5], vget_high_s16(s0), cospi_0_8_16_24, 2); + t32[2] = vmlal_lane_s16(t32[4], vget_low_s16(s0), cospi_0_8_16_24, 2); + t32[3] = vmlal_lane_s16(t32[5], vget_high_s16(s0), cospi_0_8_16_24, 2); + idct16x16_add_wrap_low_8x2(t32, d0, d1); +} + +static INLINE void idct16x16_add8x1(int16x8_t res, uint8_t **dest, + const int stride) { + uint8x8_t d = vld1_u8(*dest); + uint16x8_t q; + + res = vrshrq_n_s16(res, 6); + q = vaddw_u8(vreinterpretq_u16_s16(res), d); + d = vqmovun_s16(vreinterpretq_s16_u16(q)); + vst1_u8(*dest, d); + *dest += stride; +} + #endif // VPX_DSP_ARM_IDCT_NEON_H_ diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/intrapred_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/intrapred_neon.c index e150a5302d5..fb1fa6b681d 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/intrapred_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/intrapred_neon.c @@ -346,20 +346,54 @@ void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, vst1q_u8(dst, above_right); } +void vpx_d45_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t A0_0 = vld1q_u8(above); + const uint8x16_t A0_1 = vld1q_u8(above + 16); + const uint8x16_t above_right = vdupq_lane_u8(vget_high_u8(A0_1), 7); + const uint8x16_t A1_0 = vld1q_u8(above + 1); + const uint8x16_t A1_1 = vld1q_u8(above + 17); + const 
uint8x16_t A2_0 = vld1q_u8(above + 2); + const uint8x16_t A2_1 = vld1q_u8(above + 18); + const uint8x16_t avg_0 = vhaddq_u8(A0_0, A2_0); + const uint8x16_t avg_1 = vhaddq_u8(A0_1, A2_1); + uint8x16_t row_0 = vrhaddq_u8(avg_0, A1_0); + uint8x16_t row_1 = vrhaddq_u8(avg_1, A1_1); + int i; + (void)left; + + vst1q_u8(dst, row_0); + dst += 16; + vst1q_u8(dst, row_1); + dst += stride - 16; + + for (i = 0; i < 30; ++i) { + row_0 = vextq_u8(row_0, row_1, 1); + row_1 = vextq_u8(row_1, above_right, 1); + vst1q_u8(dst, row_0); + dst += 16; + vst1q_u8(dst, row_1); + dst += stride - 16; + } + + vst1q_u8(dst, above_right); + dst += 16; + vst1q_u8(dst, row_1); +} + // ----------------------------------------------------------------------------- void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint8x8_t XABCD = vld1_u8(above - 1); - const uint32x2_t zero = vdup_n_u32(0); - const uint32x2_t IJKL = vld1_lane_u32((const uint32_t *)left, zero, 0); - const uint8x8_t LKJI = vrev64_u8(vreinterpret_u8_u32(IJKL)); - const uint8x8_t LKJIXABC = vext_u8(LKJI, XABCD, 4); - const uint8x8_t KJIXABCD = vext_u8(LKJI, XABCD, 5); - const uint8x8_t JIXABCD0 = - vreinterpret_u8_u64(vshr_n_u64(vreinterpret_u64_u8(KJIXABCD), 8)); - const uint8x8_t avg1 = vhadd_u8(JIXABCD0, LKJIXABC); - const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABCD); + const uint8x8_t XA0123 = vld1_u8(above - 1); + const uint8x8_t L0123 = vld1_u8(left); + const uint8x8_t L3210 = vrev64_u8(L0123); + const uint8x8_t L3210XA012 = vext_u8(L3210, XA0123, 4); + const uint8x8_t L210XA0123 = vext_u8(L3210, XA0123, 5); + const uint8x8_t L10XA0123_ = + vreinterpret_u8_u64(vshr_n_u64(vreinterpret_u64_u8(L210XA0123), 8)); + const uint8x8_t avg1 = vhadd_u8(L10XA0123_, L3210XA012); + const uint8x8_t avg2 = vrhadd_u8(avg1, L210XA0123); const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2); const uint32x2_t r3 = vreinterpret_u32_u8(avg2); const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8)); @@ -374,6 +408,265 @@ void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, vst1_lane_u32((uint32_t *)dst, r3, 0); } +void vpx_d135_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x8_t XA0123456 = vld1_u8(above - 1); + const uint8x8_t A01234567 = vld1_u8(above); + const uint8x8_t A1234567_ = vld1_u8(above + 1); + const uint8x8_t L01234567 = vld1_u8(left); + const uint8x8_t L76543210 = vrev64_u8(L01234567); + const uint8x8_t L6543210X = vext_u8(L76543210, XA0123456, 1); + const uint8x8_t L543210XA0 = vext_u8(L76543210, XA0123456, 2); + const uint8x16_t L76543210XA0123456 = vcombine_u8(L76543210, XA0123456); + const uint8x16_t L6543210XA01234567 = vcombine_u8(L6543210X, A01234567); + const uint8x16_t L543210XA01234567_ = vcombine_u8(L543210XA0, A1234567_); + const uint8x16_t avg = vhaddq_u8(L76543210XA0123456, L543210XA01234567_); + const uint8x16_t row = vrhaddq_u8(avg, L6543210XA01234567); + const uint8x8_t row_0 = vget_low_u8(row); + const uint8x8_t row_1 = vget_high_u8(row); + const uint8x8_t r0 = vext_u8(row_0, row_1, 7); + const uint8x8_t r1 = vext_u8(row_0, row_1, 6); + const uint8x8_t r2 = vext_u8(row_0, row_1, 5); + const uint8x8_t r3 = vext_u8(row_0, row_1, 4); + const uint8x8_t r4 = vext_u8(row_0, row_1, 3); + const uint8x8_t r5 = vext_u8(row_0, row_1, 2); + const uint8x8_t r6 = vext_u8(row_0, row_1, 1); + + vst1_u8(dst, r0); + dst += stride; + vst1_u8(dst, r1); + dst += stride; + vst1_u8(dst, r2); + dst += stride; + 
vst1_u8(dst, r3); + dst += stride; + vst1_u8(dst, r4); + dst += stride; + vst1_u8(dst, r5); + dst += stride; + vst1_u8(dst, r6); + dst += stride; + vst1_u8(dst, row_0); +} + +static INLINE void d135_store_16x8( + uint8_t **dst, const ptrdiff_t stride, const uint8x16_t row_0, + const uint8x16_t row_1, const uint8x16_t row_2, const uint8x16_t row_3, + const uint8x16_t row_4, const uint8x16_t row_5, const uint8x16_t row_6, + const uint8x16_t row_7) { + vst1q_u8(*dst, row_0); + *dst += stride; + vst1q_u8(*dst, row_1); + *dst += stride; + vst1q_u8(*dst, row_2); + *dst += stride; + vst1q_u8(*dst, row_3); + *dst += stride; + vst1q_u8(*dst, row_4); + *dst += stride; + vst1q_u8(*dst, row_5); + *dst += stride; + vst1q_u8(*dst, row_6); + *dst += stride; + vst1q_u8(*dst, row_7); + *dst += stride; +} + +void vpx_d135_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t XA0123456789abcde = vld1q_u8(above - 1); + const uint8x16_t A0123456789abcdef = vld1q_u8(above); + const uint8x16_t A123456789abcdef_ = vld1q_u8(above + 1); + const uint8x16_t L0123456789abcdef = vld1q_u8(left); + const uint8x8_t L76543210 = vrev64_u8(vget_low_u8(L0123456789abcdef)); + const uint8x8_t Lfedcba98 = vrev64_u8(vget_high_u8(L0123456789abcdef)); + const uint8x16_t Lfedcba9876543210 = vcombine_u8(Lfedcba98, L76543210); + const uint8x16_t Ledcba9876543210X = + vextq_u8(Lfedcba9876543210, XA0123456789abcde, 1); + const uint8x16_t Ldcba9876543210XA0 = + vextq_u8(Lfedcba9876543210, XA0123456789abcde, 2); + const uint8x16_t avg_0 = vhaddq_u8(Lfedcba9876543210, Ldcba9876543210XA0); + const uint8x16_t avg_1 = vhaddq_u8(XA0123456789abcde, A123456789abcdef_); + const uint8x16_t row_0 = vrhaddq_u8(avg_0, Ledcba9876543210X); + const uint8x16_t row_1 = vrhaddq_u8(avg_1, A0123456789abcdef); + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 15); + const uint8x16_t r_1 = vextq_u8(row_0, row_1, 14); + const uint8x16_t r_2 = vextq_u8(row_0, row_1, 13); + const uint8x16_t r_3 = vextq_u8(row_0, row_1, 12); + const uint8x16_t r_4 = vextq_u8(row_0, row_1, 11); + const uint8x16_t r_5 = vextq_u8(row_0, row_1, 10); + const uint8x16_t r_6 = vextq_u8(row_0, row_1, 9); + const uint8x16_t r_7 = vcombine_u8(vget_high_u8(row_0), vget_low_u8(row_1)); + const uint8x16_t r_8 = vextq_u8(row_0, row_1, 7); + const uint8x16_t r_9 = vextq_u8(row_0, row_1, 6); + const uint8x16_t r_a = vextq_u8(row_0, row_1, 5); + const uint8x16_t r_b = vextq_u8(row_0, row_1, 4); + const uint8x16_t r_c = vextq_u8(row_0, row_1, 3); + const uint8x16_t r_d = vextq_u8(row_0, row_1, 2); + const uint8x16_t r_e = vextq_u8(row_0, row_1, 1); + + d135_store_16x8(&dst, stride, r_0, r_1, r_2, r_3, r_4, r_5, r_6, r_7); + d135_store_16x8(&dst, stride, r_8, r_9, r_a, r_b, r_c, r_d, r_e, row_0); +} + +static INLINE void d135_store_32x2(uint8_t **dst, const ptrdiff_t stride, + const uint8x16_t row_0, + const uint8x16_t row_1, + const uint8x16_t row_2) { + uint8_t *dst2 = *dst; + vst1q_u8(dst2, row_1); + dst2 += 16; + vst1q_u8(dst2, row_2); + dst2 += 16 * stride - 16; + vst1q_u8(dst2, row_0); + dst2 += 16; + vst1q_u8(dst2, row_1); + *dst += stride; +} + +void vpx_d135_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t LL0123456789abcdef = vld1q_u8(left + 16); + const uint8x16_t LU0123456789abcdef = vld1q_u8(left); + const uint8x8_t LL76543210 = vrev64_u8(vget_low_u8(LL0123456789abcdef)); + const uint8x8_t LU76543210 = vrev64_u8(vget_low_u8(LU0123456789abcdef)); + const 
uint8x8_t LLfedcba98 = vrev64_u8(vget_high_u8(LL0123456789abcdef)); + const uint8x8_t LUfedcba98 = vrev64_u8(vget_high_u8(LU0123456789abcdef)); + const uint8x16_t LLfedcba9876543210 = vcombine_u8(LLfedcba98, LL76543210); + const uint8x16_t LUfedcba9876543210 = vcombine_u8(LUfedcba98, LU76543210); + const uint8x16_t LLedcba9876543210Uf = + vextq_u8(LLfedcba9876543210, LUfedcba9876543210, 1); + const uint8x16_t LLdcba9876543210Ufe = + vextq_u8(LLfedcba9876543210, LUfedcba9876543210, 2); + const uint8x16_t avg_0 = vhaddq_u8(LLfedcba9876543210, LLdcba9876543210Ufe); + const uint8x16_t row_0 = vrhaddq_u8(avg_0, LLedcba9876543210Uf); + + const uint8x16_t XAL0123456789abcde = vld1q_u8(above - 1); + const uint8x16_t LUedcba9876543210X = + vextq_u8(LUfedcba9876543210, XAL0123456789abcde, 1); + const uint8x16_t LUdcba9876543210XA0 = + vextq_u8(LUfedcba9876543210, XAL0123456789abcde, 2); + const uint8x16_t avg_1 = vhaddq_u8(LUfedcba9876543210, LUdcba9876543210XA0); + const uint8x16_t row_1 = vrhaddq_u8(avg_1, LUedcba9876543210X); + + const uint8x16_t AL0123456789abcdef = vld1q_u8(above); + const uint8x16_t AL123456789abcdefg = vld1q_u8(above + 1); + const uint8x16_t ALfR0123456789abcde = vld1q_u8(above + 15); + const uint8x16_t AR0123456789abcdef = vld1q_u8(above + 16); + const uint8x16_t AR123456789abcdef_ = vld1q_u8(above + 17); + const uint8x16_t avg_2 = vhaddq_u8(XAL0123456789abcde, AL123456789abcdefg); + const uint8x16_t row_2 = vrhaddq_u8(avg_2, AL0123456789abcdef); + const uint8x16_t avg_3 = vhaddq_u8(ALfR0123456789abcde, AR123456789abcdef_); + const uint8x16_t row_3 = vrhaddq_u8(avg_3, AR0123456789abcdef); + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 15); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 15); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 15); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 14); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 14); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 14); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 13); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 13); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 13); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 12); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 12); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 12); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 11); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 11); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 11); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 10); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 10); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 10); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 9); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 9); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 9); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 8); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 8); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 8); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 7); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 7); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 7); + d135_store_32x2(&dst, stride, 
r_0, r_1, r_2); + } + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 6); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 6); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 6); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 5); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 5); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 5); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 4); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 4); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 4); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 3); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 3); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 3); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 2); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 2); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 2); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 1); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 1); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 1); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + d135_store_32x2(&dst, stride, row_0, row_1, row_2); +} + // ----------------------------------------------------------------------------- #if !HAVE_NEON_ASM @@ -483,133 +776,98 @@ void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, vst1_u8(dst, d); } +static INLINE void h_store_16x8(uint8_t **dst, const ptrdiff_t stride, + const uint8x8_t left) { + const uint8x16_t row_0 = vdupq_lane_u8(left, 0); + const uint8x16_t row_1 = vdupq_lane_u8(left, 1); + const uint8x16_t row_2 = vdupq_lane_u8(left, 2); + const uint8x16_t row_3 = vdupq_lane_u8(left, 3); + const uint8x16_t row_4 = vdupq_lane_u8(left, 4); + const uint8x16_t row_5 = vdupq_lane_u8(left, 5); + const uint8x16_t row_6 = vdupq_lane_u8(left, 6); + const uint8x16_t row_7 = vdupq_lane_u8(left, 7); + + vst1q_u8(*dst, row_0); + *dst += stride; + vst1q_u8(*dst, row_1); + *dst += stride; + vst1q_u8(*dst, row_2); + *dst += stride; + vst1q_u8(*dst, row_3); + *dst += stride; + vst1q_u8(*dst, row_4); + *dst += stride; + vst1q_u8(*dst, row_5); + *dst += stride; + vst1q_u8(*dst, row_6); + *dst += stride; + vst1q_u8(*dst, row_7); + *dst += stride; +} + void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint8x16_t left_u8q = vld1q_u8(left); - uint8x8_t left_u8d = vget_low_u8(left_u8q); - uint8x16_t d; - int i; (void)above; - for (i = 0; i < 2; i++, left_u8d = vget_high_u8(left_u8q)) { - d = vdupq_lane_u8(left_u8d, 0); - vst1q_u8(dst, d); - dst += stride; - d = vdupq_lane_u8(left_u8d, 1); - vst1q_u8(dst, d); - dst += stride; - d = vdupq_lane_u8(left_u8d, 2); - vst1q_u8(dst, d); - dst += stride; - d = vdupq_lane_u8(left_u8d, 3); - vst1q_u8(dst, d); - dst += stride; - d = vdupq_lane_u8(left_u8d, 4); - vst1q_u8(dst, d); - dst += stride; - d = vdupq_lane_u8(left_u8d, 5); - vst1q_u8(dst, d); - dst += stride; - d = vdupq_lane_u8(left_u8d, 6); - vst1q_u8(dst, d); - dst += stride; - d = vdupq_lane_u8(left_u8d, 7); - vst1q_u8(dst, d); - dst += stride; - } + h_store_16x8(&dst, stride, vget_low_u8(left_u8q)); + h_store_16x8(&dst, stride, vget_high_u8(left_u8q)); +} + +static INLINE void h_store_32x8(uint8_t **dst, const ptrdiff_t stride, + const uint8x8_t left) { + const uint8x16_t row_0 = vdupq_lane_u8(left, 
0); + const uint8x16_t row_1 = vdupq_lane_u8(left, 1); + const uint8x16_t row_2 = vdupq_lane_u8(left, 2); + const uint8x16_t row_3 = vdupq_lane_u8(left, 3); + const uint8x16_t row_4 = vdupq_lane_u8(left, 4); + const uint8x16_t row_5 = vdupq_lane_u8(left, 5); + const uint8x16_t row_6 = vdupq_lane_u8(left, 6); + const uint8x16_t row_7 = vdupq_lane_u8(left, 7); + + vst1q_u8(*dst, row_0); // Note clang-3.8 produced poor code w/vst2q_u8 + *dst += 16; + vst1q_u8(*dst, row_0); + *dst += stride - 16; + vst1q_u8(*dst, row_1); + *dst += 16; + vst1q_u8(*dst, row_1); + *dst += stride - 16; + vst1q_u8(*dst, row_2); + *dst += 16; + vst1q_u8(*dst, row_2); + *dst += stride - 16; + vst1q_u8(*dst, row_3); + *dst += 16; + vst1q_u8(*dst, row_3); + *dst += stride - 16; + vst1q_u8(*dst, row_4); + *dst += 16; + vst1q_u8(*dst, row_4); + *dst += stride - 16; + vst1q_u8(*dst, row_5); + *dst += 16; + vst1q_u8(*dst, row_5); + *dst += stride - 16; + vst1q_u8(*dst, row_6); + *dst += 16; + vst1q_u8(*dst, row_6); + *dst += stride - 16; + vst1q_u8(*dst, row_7); + *dst += 16; + vst1q_u8(*dst, row_7); + *dst += stride - 16; } void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - uint8x16_t d; int i; (void)above; for (i = 0; i < 2; i++, left += 16) { const uint8x16_t left_u8 = vld1q_u8(left); - const uint8x8_t left_low = vget_low_u8(left_u8); - const uint8x8_t left_high = vget_high_u8(left_u8); - d = vdupq_lane_u8(left_low, 0); - vst1q_u8(dst, d); // Note clang-3.8 produced poor code w/vst2q_u8 - dst += 16; - vst1q_u8(dst, d); - dst += stride - 16; - d = vdupq_lane_u8(left_low, 1); - vst1q_u8(dst, d); - dst += 16; - vst1q_u8(dst, d); - dst += stride - 16; - d = vdupq_lane_u8(left_low, 2); - vst1q_u8(dst, d); - dst += 16; - vst1q_u8(dst, d); - dst += stride - 16; - d = vdupq_lane_u8(left_low, 3); - vst1q_u8(dst, d); - dst += 16; - vst1q_u8(dst, d); - dst += stride - 16; - d = vdupq_lane_u8(left_low, 4); - vst1q_u8(dst, d); - dst += 16; - vst1q_u8(dst, d); - dst += stride - 16; - d = vdupq_lane_u8(left_low, 5); - vst1q_u8(dst, d); - dst += 16; - vst1q_u8(dst, d); - dst += stride - 16; - d = vdupq_lane_u8(left_low, 6); - vst1q_u8(dst, d); - dst += 16; - vst1q_u8(dst, d); - dst += stride - 16; - d = vdupq_lane_u8(left_low, 7); - vst1q_u8(dst, d); - dst += 16; - vst1q_u8(dst, d); - dst += stride - 16; - - d = vdupq_lane_u8(left_high, 0); - vst1q_u8(dst, d); - dst += 16; - vst1q_u8(dst, d); - dst += stride - 16; - d = vdupq_lane_u8(left_high, 1); - vst1q_u8(dst, d); - dst += 16; - vst1q_u8(dst, d); - dst += stride - 16; - d = vdupq_lane_u8(left_high, 2); - vst1q_u8(dst, d); - dst += 16; - vst1q_u8(dst, d); - dst += stride - 16; - d = vdupq_lane_u8(left_high, 3); - vst1q_u8(dst, d); - dst += 16; - vst1q_u8(dst, d); - dst += stride - 16; - d = vdupq_lane_u8(left_high, 4); - vst1q_u8(dst, d); - dst += 16; - vst1q_u8(dst, d); - dst += stride - 16; - d = vdupq_lane_u8(left_high, 5); - vst1q_u8(dst, d); - dst += 16; - vst1q_u8(dst, d); - dst += stride - 16; - d = vdupq_lane_u8(left_high, 6); - vst1q_u8(dst, d); - dst += 16; - vst1q_u8(dst, d); - dst += stride - 16; - d = vdupq_lane_u8(left_high, 7); - vst1q_u8(dst, d); - dst += 16; - vst1q_u8(dst, d); - dst += stride - 16; + h_store_32x8(&dst, stride, vget_low_u8(left_u8)); + h_store_32x8(&dst, stride, vget_high_u8(left_u8)); } } diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/loopfilter_4_neon.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/loopfilter_4_neon.asm index 5cd9170aea7..907e9183804 100644 
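The h_store_16x8()/h_store_32x8() refactor above only changes how the rows are written out; the prediction itself is unchanged. For reference, a scalar sketch of what vpx_h_predictor_32x32_neon computes, assuming the usual <stddef.h>/<stdint.h> includes (the helper name is hypothetical, not part of the patch):

static void h_predictor_32x32_sketch(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *left) {
  int r, c;
  // Every output row is the corresponding left-column pixel replicated 32x.
  for (r = 0; r < 32; ++r) {
    for (c = 0; c < 32; ++c) dst[c] = left[r];
    dst += stride;
  }
}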
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/loopfilter_4_neon.asm +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/loopfilter_4_neon.asm @@ -11,6 +11,7 @@ EXPORT |vpx_lpf_horizontal_4_neon| EXPORT |vpx_lpf_vertical_4_neon| EXPORT |vpx_lpf_horizontal_4_dual_neon| + EXPORT |vpx_lpf_vertical_4_dual_neon| ARM AREA ||.text||, CODE, READONLY, ALIGN=2 @@ -54,7 +55,7 @@ sub r2, r2, r1, lsl #1 sub r3, r3, r1, lsl #1 - bl vpx_loop_filter_neon + bl filter4_8 vst1.u8 {d4}, [r2@64], r1 ; store op1 vst1.u8 {d5}, [r3@64], r1 ; store op0 @@ -114,7 +115,7 @@ vtrn.8 d7, d16 vtrn.8 d17, d18 - bl vpx_loop_filter_neon + bl filter4_8 sub r0, r0, #2 @@ -131,7 +132,7 @@ pop {pc} ENDP ; |vpx_lpf_vertical_4_neon| -; void vpx_loop_filter_neon(); +; void filter4_8(); ; This is a helper function for the loopfilters. The invidual functions do the ; necessary load, transpose (if necessary) and store. The function does not use ; registers d8-d15. @@ -155,7 +156,7 @@ ; d5 op0 ; d6 oq0 ; d7 oq1 -|vpx_loop_filter_neon| PROC +|filter4_8| PROC ; filter_mask vabd.u8 d19, d3, d4 ; m1 = abs(p3 - p2) vabd.u8 d20, d4, d5 ; m2 = abs(p2 - p1) @@ -245,7 +246,7 @@ veor d7, d20, d18 ; *oq1 = u^0x80 bx lr - ENDP ; |vpx_loop_filter_neon| + ENDP ; |filter4_8| ;void vpx_lpf_horizontal_4_dual_neon(uint8_t *s, int p, ; const uint8_t *blimit0, @@ -300,7 +301,7 @@ sub r2, r2, r1, lsl #1 sub r3, r3, r1, lsl #1 - bl vpx_loop_filter_neon_16 + bl filter4_16 vst1.u8 {q5}, [r2@64], r1 ; store op1 vst1.u8 {q6}, [r3@64], r1 ; store op0 @@ -312,7 +313,122 @@ pop {pc} ENDP ; |vpx_lpf_horizontal_4_dual_neon| -; void vpx_loop_filter_neon_16(); +;void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int p, +; const uint8_t *blimit0, +; const uint8_t *limit0, +; const uint8_t *thresh0, +; const uint8_t *blimit1, +; const uint8_t *limit1, +; const uint8_t *thresh1) +; r0 uint8_t *s, +; r1 int p, +; r2 const uint8_t *blimit0, +; r3 const uint8_t *limit0, +; sp const uint8_t *thresh0, +; sp+4 const uint8_t *blimit1, +; sp+8 const uint8_t *limit1, +; sp+12 const uint8_t *thresh1, + +|vpx_lpf_vertical_4_dual_neon| PROC + push {lr} + + ldr r12, [sp, #4] ; load thresh0 + vld1.8 {d0}, [r2] ; load blimit0 to first half q + vld1.8 {d2}, [r3] ; load limit0 to first half q + + ldr r2, [sp, #8] ; load blimit1 + + vld1.8 {d4}, [r12] ; load thresh0 to first half q + + ldr r3, [sp, #12] ; load limit1 + ldr r12, [sp, #16] ; load thresh1 + vld1.8 {d1}, [r2] ; load blimit1 to 2nd half q + + sub r2, r0, #4 ; s[-4] + + vld1.8 {d3}, [r3] ; load limit1 to 2nd half q + vld1.8 {d5}, [r12] ; load thresh1 to 2nd half q + + vpush {d8-d15} ; save neon registers + + vld1.u8 {d6}, [r2], r1 ; 00 01 02 03 04 05 06 07 + vld1.u8 {d8}, [r2], r1 ; 10 11 12 13 14 15 16 17 + vld1.u8 {d10}, [r2], r1 ; 20 21 22 23 24 25 26 27 + vld1.u8 {d12}, [r2], r1 ; 30 31 32 33 34 35 36 37 + vld1.u8 {d14}, [r2], r1 ; 40 41 42 43 44 45 46 47 + vld1.u8 {d16}, [r2], r1 ; 50 51 52 53 54 55 56 57 + vld1.u8 {d18}, [r2], r1 ; 60 61 62 63 64 65 66 67 + vld1.u8 {d20}, [r2], r1 ; 70 71 72 73 74 75 76 77 + vld1.u8 {d7}, [r2], r1 ; 80 81 82 83 84 85 86 87 + vld1.u8 {d9}, [r2], r1 ; 90 91 92 93 94 95 96 97 + vld1.u8 {d11}, [r2], r1 ; A0 A1 A2 A3 A4 A5 A6 A7 + vld1.u8 {d13}, [r2], r1 ; B0 B1 B2 B3 B4 B5 B6 B7 + vld1.u8 {d15}, [r2], r1 ; C0 C1 C2 C3 C4 C5 C6 C7 + vld1.u8 {d17}, [r2], r1 ; D0 D1 D2 D3 D4 D5 D6 D7 + vld1.u8 {d19}, [r2], r1 ; E0 E1 E2 E3 E4 E5 E6 E7 + vld1.u8 {d21}, [r2] ; F0 F1 F2 F3 F4 F5 F6 F7 + + vtrn.8 q3, q4 ; q3 : 00 10 02 12 04 14 06 16 80 90 82 92 84 94 86 96 + ; q4 : 01 11 03 13 05 
15 07 17 81 91 83 93 85 95 87 97 + vtrn.8 q5, q6 ; q5 : 20 30 22 32 24 34 26 36 A0 B0 A2 B2 A4 B4 A6 B6 + ; q6 : 21 31 23 33 25 35 27 37 A1 B1 A3 B3 A5 B5 A7 B7 + vtrn.8 q7, q8 ; q7 : 40 50 42 52 44 54 46 56 C0 D0 C2 D2 C4 D4 C6 D6 + ; q8 : 41 51 43 53 45 55 47 57 C1 D1 C3 D3 C5 D5 C7 D7 + vtrn.8 q9, q10 ; q9 : 60 70 62 72 64 74 66 76 E0 F0 E2 F2 E4 F4 E6 F6 + ; q10: 61 71 63 73 65 75 67 77 E1 F1 E3 F3 E5 F5 E7 F7 + + vtrn.16 q3, q5 ; q3 : 00 10 20 30 04 14 24 34 80 90 A0 B0 84 94 A4 B4 + ; q5 : 02 12 22 32 06 16 26 36 82 92 A2 B2 86 96 A6 B6 + vtrn.16 q4, q6 ; q4 : 01 11 21 31 05 15 25 35 81 91 A1 B1 85 95 A5 B5 + ; q6 : 03 13 23 33 07 17 27 37 83 93 A3 B3 87 97 A7 B7 + vtrn.16 q7, q9 ; q7 : 40 50 60 70 44 54 64 74 C0 D0 E0 F0 C4 D4 E4 F4 + ; q9 : 42 52 62 72 46 56 66 76 C2 D2 E2 F2 C6 D6 E6 F6 + vtrn.16 q8, q10 ; q8 : 41 51 61 71 45 55 65 75 C1 D1 E1 F1 C5 D5 E5 F5 + ; q10: 43 53 63 73 47 57 67 77 C3 D3 E3 F3 C7 D7 E7 F7 + + vtrn.32 q3, q7 ; q3 : 00 10 20 30 40 50 60 70 80 90 A0 B0 C0 D0 E0 F0 + ; q7 : 04 14 24 34 44 54 64 74 84 94 A4 B4 C4 D4 E4 F4 + vtrn.32 q5, q9 ; q5 : 02 12 22 32 42 52 62 72 82 92 A2 B2 C2 D2 E2 F2 + ; q9 : 06 16 26 36 46 56 66 76 86 96 A6 B6 C6 D6 E6 F6 + vtrn.32 q4, q8 ; q4 : 01 11 21 31 41 51 61 71 81 91 A1 B1 C1 D1 E1 F1 + ; q8 : 05 15 25 35 45 55 65 75 85 95 A5 B5 C5 D5 E5 F5 + vtrn.32 q6, q10 ; q6 : 03 13 23 33 43 53 63 73 83 93 A3 B3 C3 D3 E3 F3 + ; q10: 07 17 27 37 47 57 67 77 87 97 A7 B7 C7 D7 E7 F7 + + bl filter4_16 + + sub r0, #2 + + vmov d0, d11 + vmov d1, d13 + vmov d2, d15 + vmov d3, d17 + vmov d11, d12 + vmov d12, d14 + vmov d13, d16 + vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1 + vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r0], r1 + vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1 + vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r0], r1 + vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1 + vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [r0], r1 + vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1 + vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r0], r1 + vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0], r1 + vst4.8 {d0[1], d1[1], d2[1], d3[1]}, [r0], r1 + vst4.8 {d0[2], d1[2], d2[2], d3[2]}, [r0], r1 + vst4.8 {d0[3], d1[3], d2[3], d3[3]}, [r0], r1 + vst4.8 {d0[4], d1[4], d2[4], d3[4]}, [r0], r1 + vst4.8 {d0[5], d1[5], d2[5], d3[5]}, [r0], r1 + vst4.8 {d0[6], d1[6], d2[6], d3[6]}, [r0], r1 + vst4.8 {d0[7], d1[7], d2[7], d3[7]}, [r0] + + vpop {d8-d15} ; restore neon registers + + pop {pc} + ENDP ; |vpx_lpf_vertical_4_dual_neon| + +; void filter4_16(); ; This is a helper function for the loopfilters. The invidual functions do the ; necessary load, transpose (if necessary) and store. This function uses ; registers d8-d15, so the calling function must save those registers. @@ -335,7 +451,7 @@ ; q6 op0 ; q7 oq0 ; q8 oq1 -|vpx_loop_filter_neon_16| PROC +|filter4_16| PROC ; filter_mask vabd.u8 q11, q3, q4 ; m1 = abs(p3 - p2) @@ -428,6 +544,6 @@ veor q8, q12, q10 ; *oq1 = u^0x80 bx lr - ENDP ; |vpx_loop_filter_neon_16| + ENDP ; |filter4_16| END diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/loopfilter_vertical_4_dual_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/loopfilter_vertical_4_dual_neon.c deleted file mode 100644 index ced5aef0ab2..00000000000 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/loopfilter_vertical_4_dual_neon.c +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. 
- * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include <arm_neon.h> - -#include "./vpx_dsp_rtcd.h" -#include "./vpx_config.h" -#include "vpx/vpx_integer.h" - -void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int p, const uint8_t *blimit0, - const uint8_t *limit0, const uint8_t *thresh0, - const uint8_t *blimit1, const uint8_t *limit1, - const uint8_t *thresh1) { - vpx_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0); - vpx_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1); -} diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/transpose_neon.h b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/transpose_neon.h index 445add29689..8366ce50b87 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/transpose_neon.h +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/transpose_neon.h @@ -21,7 +21,7 @@ // // b0.val[0]: 00 01 02 03 16 17 18 19 // b0.val[1]: 04 05 06 07 20 21 22 23 -static INLINE int16x8x2_t vpx_vtrnq_s64(int32x4_t a0, int32x4_t a1) { +static INLINE int16x8x2_t vpx_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) { int16x8x2_t b0; b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)), vreinterpret_s16_s32(vget_low_s32(a1))); @@ -30,7 +30,23 @@ static INLINE int16x8x2_t vpx_vtrnq_s64(int32x4_t a0, int32x4_t a1) { return b0; } -static INLINE uint8x16x2_t vpx_vtrnq_u64(uint32x4_t a0, uint32x4_t a1) { +static INLINE int32x4x2_t vpx_vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1) { + int32x4x2_t b0; + b0.val[0] = vcombine_s32(vget_low_s32(a0), vget_low_s32(a1)); + b0.val[1] = vcombine_s32(vget_high_s32(a0), vget_high_s32(a1)); + return b0; +} + +static INLINE int64x2x2_t vpx_vtrnq_s64(int32x4_t a0, int32x4_t a1) { + int64x2x2_t b0; + b0.val[0] = vcombine_s64(vreinterpret_s64_s32(vget_low_s32(a0)), + vreinterpret_s64_s32(vget_low_s32(a1))); + b0.val[1] = vcombine_s64(vreinterpret_s64_s32(vget_high_s32(a0)), + vreinterpret_s64_s32(vget_high_s32(a1))); + return b0; +} + +static INLINE uint8x16x2_t vpx_vtrnq_u64_to_u8(uint32x4_t a0, uint32x4_t a1) { uint8x16x2_t b0; b0.val[0] = vcombine_u8(vreinterpret_u8_u32(vget_low_u32(a0)), vreinterpret_u8_u32(vget_low_u32(a1))); @@ -110,6 +126,37 @@ static INLINE void transpose_s16_4x4d(int16x4_t *a0, int16x4_t *a1, *a3 = vreinterpret_s16_s32(c1.val[1]); } +static INLINE void transpose_s16_4x4q(int16x8_t *a0, int16x8_t *a1) { + // Swap 32 bit elements. 
Goes from: + // a0: 00 01 02 03 10 11 12 13 + // a1: 20 21 22 23 30 31 32 33 + // to: + // b0.val[0]: 00 01 20 21 10 11 30 31 + // b0.val[1]: 02 03 22 23 12 13 32 33 + + const int32x4x2_t b0 = + vtrnq_s32(vreinterpretq_s32_s16(*a0), vreinterpretq_s32_s16(*a1)); + + // Swap 64 bit elements resulting in: + // c0.val[0]: 00 01 20 21 02 03 22 23 + // c0.val[1]: 10 11 30 31 12 13 32 33 + + const int32x4_t c0 = + vcombine_s32(vget_low_s32(b0.val[0]), vget_low_s32(b0.val[1])); + const int32x4_t c1 = + vcombine_s32(vget_high_s32(b0.val[0]), vget_high_s32(b0.val[1])); + + // Swap 16 bit elements resulting in: + // d0.val[0]: 00 10 20 30 02 12 22 32 + // d0.val[1]: 01 11 21 31 03 13 23 33 + + const int16x8x2_t d0 = + vtrnq_s16(vreinterpretq_s16_s32(c0), vreinterpretq_s16_s32(c1)); + + *a0 = d0.val[0]; + *a1 = d0.val[1]; +} + static INLINE void transpose_u16_4x4q(uint16x8_t *a0, uint16x8_t *a1) { // Swap 32 bit elements. Goes from: // a0: 00 01 02 03 10 11 12 13 @@ -141,6 +188,211 @@ static INLINE void transpose_u16_4x4q(uint16x8_t *a0, uint16x8_t *a1) { *a1 = d0.val[1]; } +static INLINE void transpose_u8_4x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2, + uint8x8_t *a3, const uint8x8_t a4, + const uint8x8_t a5, const uint8x8_t a6, + const uint8x8_t a7) { + // Swap 32 bit elements. Goes from: + // a0: 00 01 02 03 XX XX XX XX + // a1: 10 11 12 13 XX XX XX XX + // a2: 20 21 22 23 XX XX XX XX + // a3; 30 31 32 33 XX XX XX XX + // a4: 40 41 42 43 XX XX XX XX + // a5: 50 51 52 53 XX XX XX XX + // a6: 60 61 62 63 XX XX XX XX + // a7: 70 71 72 73 XX XX XX XX + // to: + // b0.val[0]: 00 01 02 03 40 41 42 43 + // b1.val[0]: 10 11 12 13 50 51 52 53 + // b2.val[0]: 20 21 22 23 60 61 62 63 + // b3.val[0]: 30 31 32 33 70 71 72 73 + + const uint32x2x2_t b0 = + vtrn_u32(vreinterpret_u32_u8(*a0), vreinterpret_u32_u8(a4)); + const uint32x2x2_t b1 = + vtrn_u32(vreinterpret_u32_u8(*a1), vreinterpret_u32_u8(a5)); + const uint32x2x2_t b2 = + vtrn_u32(vreinterpret_u32_u8(*a2), vreinterpret_u32_u8(a6)); + const uint32x2x2_t b3 = + vtrn_u32(vreinterpret_u32_u8(*a3), vreinterpret_u32_u8(a7)); + + // Swap 16 bit elements resulting in: + // c0.val[0]: 00 01 20 21 40 41 60 61 + // c0.val[1]: 02 03 22 23 42 43 62 63 + // c1.val[0]: 10 11 30 31 50 51 70 71 + // c1.val[1]: 12 13 32 33 52 53 72 73 + + const uint16x4x2_t c0 = vtrn_u16(vreinterpret_u16_u32(b0.val[0]), + vreinterpret_u16_u32(b2.val[0])); + const uint16x4x2_t c1 = vtrn_u16(vreinterpret_u16_u32(b1.val[0]), + vreinterpret_u16_u32(b3.val[0])); + + // Swap 8 bit elements resulting in: + // d0.val[0]: 00 10 20 30 40 50 60 70 + // d0.val[1]: 01 11 21 31 41 51 61 71 + // d1.val[0]: 02 12 22 32 42 52 62 72 + // d1.val[1]: 03 13 23 33 43 53 63 73 + + const uint8x8x2_t d0 = + vtrn_u8(vreinterpret_u8_u16(c0.val[0]), vreinterpret_u8_u16(c1.val[0])); + const uint8x8x2_t d1 = + vtrn_u8(vreinterpret_u8_u16(c0.val[1]), vreinterpret_u8_u16(c1.val[1])); + + *a0 = d0.val[0]; + *a1 = d0.val[1]; + *a2 = d1.val[0]; + *a3 = d1.val[1]; +} + +static INLINE void transpose_s32_4x4(int32x4_t *a0, int32x4_t *a1, + int32x4_t *a2, int32x4_t *a3) { + // Swap 32 bit elements. 
Goes from: + // a0: 00 01 02 03 + // a1: 10 11 12 13 + // a2: 20 21 22 23 + // a3: 30 31 32 33 + // to: + // b0.val[0]: 00 10 02 12 + // b0.val[1]: 01 11 03 13 + // b1.val[0]: 20 30 22 32 + // b1.val[1]: 21 31 23 33 + + const int32x4x2_t b0 = vtrnq_s32(*a0, *a1); + const int32x4x2_t b1 = vtrnq_s32(*a2, *a3); + + // Swap 64 bit elements resulting in: + // c0.val[0]: 00 10 20 30 + // c0.val[1]: 02 12 22 32 + // c1.val[0]: 01 11 21 31 + // c1.val[1]: 03 13 23 33 + + const int32x4x2_t c0 = vpx_vtrnq_s64_to_s32(b0.val[0], b1.val[0]); + const int32x4x2_t c1 = vpx_vtrnq_s64_to_s32(b0.val[1], b1.val[1]); + + *a0 = c0.val[0]; + *a1 = c1.val[0]; + *a2 = c0.val[1]; + *a3 = c1.val[1]; +} + +static INLINE void transpose_s16_4x8(const int16x4_t a0, const int16x4_t a1, + const int16x4_t a2, const int16x4_t a3, + const int16x4_t a4, const int16x4_t a5, + const int16x4_t a6, const int16x4_t a7, + int16x8_t *const o0, int16x8_t *const o1, + int16x8_t *const o2, int16x8_t *const o3) { + // Swap 16 bit elements. Goes from: + // a0: 00 01 02 03 + // a1: 10 11 12 13 + // a2: 20 21 22 23 + // a3: 30 31 32 33 + // a4: 40 41 42 43 + // a5: 50 51 52 53 + // a6: 60 61 62 63 + // a7: 70 71 72 73 + // to: + // b0.val[0]: 00 10 02 12 + // b0.val[1]: 01 11 03 13 + // b1.val[0]: 20 30 22 32 + // b1.val[1]: 21 31 23 33 + // b2.val[0]: 40 50 42 52 + // b2.val[1]: 41 51 43 53 + // b3.val[0]: 60 70 62 72 + // b3.val[1]: 61 71 63 73 + + const int16x4x2_t b0 = vtrn_s16(a0, a1); + const int16x4x2_t b1 = vtrn_s16(a2, a3); + const int16x4x2_t b2 = vtrn_s16(a4, a5); + const int16x4x2_t b3 = vtrn_s16(a6, a7); + + // Swap 32 bit elements resulting in: + // c0.val[0]: 00 10 20 30 + // c0.val[1]: 02 12 22 32 + // c1.val[0]: 01 11 21 31 + // c1.val[1]: 03 13 23 33 + // c2.val[0]: 40 50 60 70 + // c2.val[1]: 42 52 62 72 + // c3.val[0]: 41 51 61 71 + // c3.val[1]: 43 53 63 73 + + const int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]), + vreinterpret_s32_s16(b1.val[0])); + const int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]), + vreinterpret_s32_s16(b1.val[1])); + const int32x2x2_t c2 = vtrn_s32(vreinterpret_s32_s16(b2.val[0]), + vreinterpret_s32_s16(b3.val[0])); + const int32x2x2_t c3 = vtrn_s32(vreinterpret_s32_s16(b2.val[1]), + vreinterpret_s32_s16(b3.val[1])); + + // Swap 64 bit elements resulting in: + // o0: 00 10 20 30 40 50 60 70 + // o1: 01 11 21 31 41 51 61 71 + // o2: 02 12 22 32 42 52 62 72 + // o3: 03 13 23 33 43 53 63 73 + + *o0 = vcombine_s16(vreinterpret_s16_s32(c0.val[0]), + vreinterpret_s16_s32(c2.val[0])); + *o1 = vcombine_s16(vreinterpret_s16_s32(c1.val[0]), + vreinterpret_s16_s32(c3.val[0])); + *o2 = vcombine_s16(vreinterpret_s16_s32(c0.val[1]), + vreinterpret_s16_s32(c2.val[1])); + *o3 = vcombine_s16(vreinterpret_s16_s32(c1.val[1]), + vreinterpret_s16_s32(c3.val[1])); +} + +static INLINE void transpose_s32_4x8(int32x4_t *const a0, int32x4_t *const a1, + int32x4_t *const a2, int32x4_t *const a3, + int32x4_t *const a4, int32x4_t *const a5, + int32x4_t *const a6, int32x4_t *const a7) { + // Swap 32 bit elements. 
Goes from: + // a0: 00 01 02 03 + // a1: 10 11 12 13 + // a2: 20 21 22 23 + // a3: 30 31 32 33 + // a4: 40 41 42 43 + // a5: 50 51 52 53 + // a6: 60 61 62 63 + // a7: 70 71 72 73 + // to: + // b0.val[0]: 00 10 02 12 + // b0.val[1]: 01 11 03 13 + // b1.val[0]: 20 30 22 32 + // b1.val[1]: 21 31 23 33 + // b2.val[0]: 40 50 42 52 + // b2.val[1]: 41 51 43 53 + // b3.val[0]: 60 70 62 72 + // b3.val[1]: 61 71 63 73 + + const int32x4x2_t b0 = vtrnq_s32(*a0, *a1); + const int32x4x2_t b1 = vtrnq_s32(*a2, *a3); + const int32x4x2_t b2 = vtrnq_s32(*a4, *a5); + const int32x4x2_t b3 = vtrnq_s32(*a6, *a7); + + // Swap 64 bit elements resulting in: + // c0.val[0]: 00 10 20 30 + // c0.val[1]: 02 12 22 32 + // c1.val[0]: 01 11 21 31 + // c1.val[1]: 03 13 23 33 + // c2.val[0]: 40 50 60 70 + // c2.val[1]: 42 52 62 72 + // c3.val[0]: 41 51 61 71 + // c3.val[1]: 43 53 63 73 + + const int64x2x2_t c0 = vpx_vtrnq_s64(b0.val[0], b1.val[0]); + const int64x2x2_t c1 = vpx_vtrnq_s64(b0.val[1], b1.val[1]); + const int64x2x2_t c2 = vpx_vtrnq_s64(b2.val[0], b3.val[0]); + const int64x2x2_t c3 = vpx_vtrnq_s64(b2.val[1], b3.val[1]); + + *a0 = vreinterpretq_s32_s64(c0.val[0]); + *a1 = vreinterpretq_s32_s64(c2.val[0]); + *a2 = vreinterpretq_s32_s64(c1.val[0]); + *a3 = vreinterpretq_s32_s64(c3.val[0]); + *a4 = vreinterpretq_s32_s64(c0.val[1]); + *a5 = vreinterpretq_s32_s64(c2.val[1]); + *a6 = vreinterpretq_s32_s64(c1.val[1]); + *a7 = vreinterpretq_s32_s64(c3.val[1]); +} + static INLINE void transpose_u8_8x4(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2, uint8x8_t *a3) { // Swap 8 bit elements. Goes from: @@ -207,6 +459,59 @@ static INLINE void transpose_u16_8x4(uint16x8_t *a0, uint16x8_t *a1, *a3 = vreinterpretq_u16_u32(c1.val[1]); } +static INLINE void transpose_s32_8x4(int32x4_t *const a0, int32x4_t *const a1, + int32x4_t *const a2, int32x4_t *const a3, + int32x4_t *const a4, int32x4_t *const a5, + int32x4_t *const a6, int32x4_t *const a7) { + // Swap 32 bit elements. Goes from: + // a0: 00 01 02 03 + // a1: 04 05 06 07 + // a2: 10 11 12 13 + // a3: 14 15 16 17 + // a4: 20 21 22 23 + // a5: 24 25 26 27 + // a6: 30 31 32 33 + // a7: 34 35 36 37 + // to: + // b0.val[0]: 00 10 02 12 + // b0.val[1]: 01 11 03 13 + // b1.val[0]: 04 14 06 16 + // b1.val[1]: 05 15 07 17 + // b2.val[0]: 20 30 22 32 + // b2.val[1]: 21 31 23 33 + // b3.val[0]: 24 34 26 36 + // b3.val[1]: 25 35 27 37 + + const int32x4x2_t b0 = vtrnq_s32(*a0, *a2); + const int32x4x2_t b1 = vtrnq_s32(*a1, *a3); + const int32x4x2_t b2 = vtrnq_s32(*a4, *a6); + const int32x4x2_t b3 = vtrnq_s32(*a5, *a7); + + // Swap 64 bit elements resulting in: + // c0.val[0]: 00 10 20 30 + // c0.val[1]: 02 12 22 32 + // c1.val[0]: 01 11 21 31 + // c1.val[1]: 03 13 23 33 + // c2.val[0]: 04 14 24 34 + // c2.val[1]: 06 16 26 36 + // c3.val[0]: 05 15 25 35 + // c3.val[1]: 07 17 27 37 + + const int64x2x2_t c0 = vpx_vtrnq_s64(b0.val[0], b2.val[0]); + const int64x2x2_t c1 = vpx_vtrnq_s64(b0.val[1], b2.val[1]); + const int64x2x2_t c2 = vpx_vtrnq_s64(b1.val[0], b3.val[0]); + const int64x2x2_t c3 = vpx_vtrnq_s64(b1.val[1], b3.val[1]); + + *a0 = vreinterpretq_s32_s64(c0.val[0]); + *a1 = vreinterpretq_s32_s64(c1.val[0]); + *a2 = vreinterpretq_s32_s64(c0.val[1]); + *a3 = vreinterpretq_s32_s64(c1.val[1]); + *a4 = vreinterpretq_s32_s64(c2.val[0]); + *a5 = vreinterpretq_s32_s64(c3.val[0]); + *a6 = vreinterpretq_s32_s64(c2.val[1]); + *a7 = vreinterpretq_s32_s64(c3.val[1]); +} + // Note: Using 'd' registers or 'q' registers has almost identical speed. We use // 'q' registers here to save some instructions. 
static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2, @@ -319,10 +624,10 @@ static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1, // d2.val[1]: 06 16 26 36 46 56 66 76 // d3.val[0]: 03 13 23 33 43 53 63 73 // d3.val[1]: 07 17 27 37 47 57 67 77 - const int16x8x2_t d0 = vpx_vtrnq_s64(c0.val[0], c2.val[0]); - const int16x8x2_t d1 = vpx_vtrnq_s64(c1.val[0], c3.val[0]); - const int16x8x2_t d2 = vpx_vtrnq_s64(c0.val[1], c2.val[1]); - const int16x8x2_t d3 = vpx_vtrnq_s64(c1.val[1], c3.val[1]); + const int16x8x2_t d0 = vpx_vtrnq_s64_to_s16(c0.val[0], c2.val[0]); + const int16x8x2_t d1 = vpx_vtrnq_s64_to_s16(c1.val[0], c3.val[0]); + const int16x8x2_t d2 = vpx_vtrnq_s64_to_s16(c0.val[1], c2.val[1]); + const int16x8x2_t d3 = vpx_vtrnq_s64_to_s16(c1.val[1], c3.val[1]); *a0 = d0.val[0]; *a1 = d1.val[0]; @@ -758,14 +1063,14 @@ static INLINE void transpose_u8_16x16( // e6.val[1]: 0E 1E 2E 3E 4E 5E 6E 7E 8E 9E AE BE CE DE EE FE // e7.val[0]: 07 17 27 37 47 57 67 77 87 97 A7 B7 C7 D7 E7 F7 // e7.val[1]: 0F 1F 2F 3F 4F 5F 6F 7F 8F 9F AF BF CF DF EF FF - const uint8x16x2_t e0 = vpx_vtrnq_u64(d0.val[0], d4.val[0]); - const uint8x16x2_t e1 = vpx_vtrnq_u64(d2.val[0], d6.val[0]); - const uint8x16x2_t e2 = vpx_vtrnq_u64(d1.val[0], d5.val[0]); - const uint8x16x2_t e3 = vpx_vtrnq_u64(d3.val[0], d7.val[0]); - const uint8x16x2_t e4 = vpx_vtrnq_u64(d0.val[1], d4.val[1]); - const uint8x16x2_t e5 = vpx_vtrnq_u64(d2.val[1], d6.val[1]); - const uint8x16x2_t e6 = vpx_vtrnq_u64(d1.val[1], d5.val[1]); - const uint8x16x2_t e7 = vpx_vtrnq_u64(d3.val[1], d7.val[1]); + const uint8x16x2_t e0 = vpx_vtrnq_u64_to_u8(d0.val[0], d4.val[0]); + const uint8x16x2_t e1 = vpx_vtrnq_u64_to_u8(d2.val[0], d6.val[0]); + const uint8x16x2_t e2 = vpx_vtrnq_u64_to_u8(d1.val[0], d5.val[0]); + const uint8x16x2_t e3 = vpx_vtrnq_u64_to_u8(d3.val[0], d7.val[0]); + const uint8x16x2_t e4 = vpx_vtrnq_u64_to_u8(d0.val[1], d4.val[1]); + const uint8x16x2_t e5 = vpx_vtrnq_u64_to_u8(d2.val[1], d6.val[1]); + const uint8x16x2_t e6 = vpx_vtrnq_u64_to_u8(d1.val[1], d5.val[1]); + const uint8x16x2_t e7 = vpx_vtrnq_u64_to_u8(d3.val[1], d7.val[1]); // Output: // o0 : 00 10 20 30 40 50 60 70 80 90 A0 B0 C0 D0 E0 F0 @@ -802,4 +1107,101 @@ static INLINE void transpose_u8_16x16( *o15 = e7.val[1]; } +static INLINE void load_and_transpose_u8_4x8(const uint8_t *a, + const int a_stride, uint8x8_t *a0, + uint8x8_t *a1, uint8x8_t *a2, + uint8x8_t *a3) { + uint8x8_t a4, a5, a6, a7; + *a0 = vld1_u8(a); + a += a_stride; + *a1 = vld1_u8(a); + a += a_stride; + *a2 = vld1_u8(a); + a += a_stride; + *a3 = vld1_u8(a); + a += a_stride; + a4 = vld1_u8(a); + a += a_stride; + a5 = vld1_u8(a); + a += a_stride; + a6 = vld1_u8(a); + a += a_stride; + a7 = vld1_u8(a); + + transpose_u8_4x8(a0, a1, a2, a3, a4, a5, a6, a7); +} + +static INLINE void load_and_transpose_u8_8x8(const uint8_t *a, + const int a_stride, uint8x8_t *a0, + uint8x8_t *a1, uint8x8_t *a2, + uint8x8_t *a3, uint8x8_t *a4, + uint8x8_t *a5, uint8x8_t *a6, + uint8x8_t *a7) { + *a0 = vld1_u8(a); + a += a_stride; + *a1 = vld1_u8(a); + a += a_stride; + *a2 = vld1_u8(a); + a += a_stride; + *a3 = vld1_u8(a); + a += a_stride; + *a4 = vld1_u8(a); + a += a_stride; + *a5 = vld1_u8(a); + a += a_stride; + *a6 = vld1_u8(a); + a += a_stride; + *a7 = vld1_u8(a); + + transpose_u8_8x8(a0, a1, a2, a3, a4, a5, a6, a7); +} + +static INLINE void transpose_and_store_u8_8x8(uint8_t *a, const int a_stride, + uint8x8_t a0, uint8x8_t a1, + uint8x8_t a2, uint8x8_t a3, + uint8x8_t a4, uint8x8_t a5, + uint8x8_t a6, 
uint8x8_t a7) { + transpose_u8_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); + + vst1_u8(a, a0); + a += a_stride; + vst1_u8(a, a1); + a += a_stride; + vst1_u8(a, a2); + a += a_stride; + vst1_u8(a, a3); + a += a_stride; + vst1_u8(a, a4); + a += a_stride; + vst1_u8(a, a5); + a += a_stride; + vst1_u8(a, a6); + a += a_stride; + vst1_u8(a, a7); +} + +static INLINE void load_and_transpose_s16_8x8(const int16_t *a, + const int a_stride, int16x8_t *a0, + int16x8_t *a1, int16x8_t *a2, + int16x8_t *a3, int16x8_t *a4, + int16x8_t *a5, int16x8_t *a6, + int16x8_t *a7) { + *a0 = vld1q_s16(a); + a += a_stride; + *a1 = vld1q_s16(a); + a += a_stride; + *a2 = vld1q_s16(a); + a += a_stride; + *a3 = vld1q_s16(a); + a += a_stride; + *a4 = vld1q_s16(a); + a += a_stride; + *a5 = vld1q_s16(a); + a += a_stride; + *a6 = vld1q_s16(a); + a += a_stride; + *a7 = vld1q_s16(a); + + transpose_s16_8x8(a0, a1, a2, a3, a4, a5, a6, a7); +} #endif // VPX_DSP_ARM_TRANSPOSE_NEON_H_ diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/deblock.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/deblock.c index 589b124e26a..6c27484979a 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/deblock.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/deblock.c @@ -117,7 +117,7 @@ void vpx_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, unsigned char d[16]; for (r = 0; r < rows; r++) { - int sumsq = 0; + int sumsq = 16; int sum = 0; for (i = -8; i < 0; i++) s[i] = s[0]; @@ -156,14 +156,12 @@ void vpx_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols, int flimit) { int r, c, i; - const int16_t *rv3 = &vpx_rv[63 & rand()]; for (c = 0; c < cols; c++) { unsigned char *s = &dst[c]; int sumsq = 0; int sum = 0; unsigned char d[16]; - const int16_t *rv2 = rv3 + ((c * 17) & 127); for (i = -8; i < 0; i++) s[i * pitch] = s[0]; @@ -183,7 +181,7 @@ void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols, d[r & 15] = s[0]; if (sumsq * 15 - sum * sum < flimit) { - d[r & 15] = (rv2[r & 127] + sum + s[0]) >> 4; + d[r & 15] = (vpx_rv[(r & 127) + (c & 7)] + sum + s[0]) >> 4; } if (r >= 8) s[-8 * pitch] = d[(r - 8) & 15]; s += pitch; diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/inv_txfm.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/inv_txfm.c index f3f543ddfe8..0f9aff1892a 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/inv_txfm.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/inv_txfm.c @@ -67,7 +67,7 @@ void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) { } } -void vpx_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) { +void vpx_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int stride) { int i; tran_high_t a1, e1; tran_low_t tmp[4]; @@ -84,10 +84,10 @@ void vpx_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) { for (i = 0; i < 4; i++) { e1 = ip[0] >> 1; a1 = ip[0] - e1; - dest[dest_stride * 0] = clip_pixel_add(dest[dest_stride * 0], a1); - dest[dest_stride * 1] = clip_pixel_add(dest[dest_stride * 1], e1); - dest[dest_stride * 2] = clip_pixel_add(dest[dest_stride * 2], e1); - dest[dest_stride * 3] = clip_pixel_add(dest[dest_stride * 3], e1); + dest[stride * 0] = clip_pixel_add(dest[stride * 0], a1); + dest[stride * 1] = clip_pixel_add(dest[stride * 1], e1); + dest[stride * 2] = clip_pixel_add(dest[stride * 2], e1); + dest[stride * 3] = clip_pixel_add(dest[stride * 3], e1); 
ip++; dest++; } @@ -138,8 +138,7 @@ void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) { } } -void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, - int dest_stride) { +void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { int i; tran_high_t a1; tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); @@ -152,7 +151,7 @@ void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, dest[1] = clip_pixel_add(dest[1], a1); dest[2] = clip_pixel_add(dest[2], a1); dest[3] = clip_pixel_add(dest[3], a1); - dest += dest_stride; + dest += stride; } } @@ -1324,7 +1323,7 @@ void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, } void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8, - int dest_stride, int bd) { + int stride, int bd) { int i; tran_high_t a1, e1; tran_low_t tmp[4]; @@ -1343,14 +1342,10 @@ void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8, for (i = 0; i < 4; i++) { e1 = ip[0] >> 1; a1 = ip[0] - e1; - dest[dest_stride * 0] = - highbd_clip_pixel_add(dest[dest_stride * 0], a1, bd); - dest[dest_stride * 1] = - highbd_clip_pixel_add(dest[dest_stride * 1], e1, bd); - dest[dest_stride * 2] = - highbd_clip_pixel_add(dest[dest_stride * 2], e1, bd); - dest[dest_stride * 3] = - highbd_clip_pixel_add(dest[dest_stride * 3], e1, bd); + dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0], a1, bd); + dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1], e1, bd); + dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2], e1, bd); + dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3], e1, bd); ip++; dest++; } @@ -1413,7 +1408,7 @@ void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, } void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8, - int dest_stride, int bd) { + int stride, int bd) { int i; tran_high_t a1; tran_low_t out = @@ -1428,7 +1423,7 @@ void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8, dest[1] = highbd_clip_pixel_add(dest[1], a1, bd); dest[2] = highbd_clip_pixel_add(dest[2], a1, bd); dest[3] = highbd_clip_pixel_add(dest[3], a1, bd); - dest += dest_stride; + dest += stride; } } diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/convolve8_avg_dspr2.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/convolve8_avg_dspr2.c index 31812299c34..b4ed6ee850a 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/convolve8_avg_dspr2.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/convolve8_avg_dspr2.c @@ -403,8 +403,11 @@ void vpx_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int filter_y_stride, int w, int h) { int x, y; - uint32_t tp1, tp2, tn1; - uint32_t tp3, tp4, tn2; + uint32_t tp1, tp2, tn1, tp3, tp4, tn2; + (void)filter_x; + (void)filter_x_stride; + (void)filter_y; + (void)filter_y_stride; /* prefetch data to cache memory */ prefetch_load(src); diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/convolve8_dspr2.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/convolve8_dspr2.c index f6812c7d049..8d35b6394e2 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/convolve8_dspr2.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/convolve8_dspr2.c @@ -1307,6 +1307,7 @@ void vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, assert(y_step_q4 == 16); assert(((const int32_t *)filter_x)[1] != 0x800000); assert(((const 
int32_t *)filter_y)[1] != 0x800000); + (void)x_step_q4; /* bit positon for extract from acc */ __asm__ __volatile__("wrdsp %[pos], 1 \n\t" @@ -1398,6 +1399,10 @@ void vpx_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int filter_y_stride, int w, int h) { int x, y; + (void)filter_x; + (void)filter_x_stride; + (void)filter_y; + (void)filter_y_stride; /* prefetch data to cache memory */ prefetch_load(src); diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/deblock_msa.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/deblock_msa.c index cc633c6698d..e33ea740a9e 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/deblock_msa.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/deblock_msa.c @@ -459,7 +459,7 @@ void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch, flimit_vec = __msa_fill_w(flimit); for (row = rows; row--;) { - int32_t sum_sq = 0; + int32_t sum_sq; int32_t sum = 0; src0 = (v16u8)__msa_fill_b(src_dup[0]); ST8x1_UB(src0, (src_dup - 8)); @@ -474,7 +474,7 @@ void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch, ILVRL_B2_UH(zero, src, src_r_h, src_l_h); src_r_w = __msa_dotp_u_w(src_r_h, src_r_h); src_r_w += __msa_dotp_u_w(src_l_h, src_l_h); - sum_sq = HADD_SW_S32(src_r_w); + sum_sq = HADD_SW_S32(src_r_w) + 16; sum_h = __msa_hadd_u_h(src, src); sum = HADD_UH_U32(sum_h); { @@ -573,7 +573,6 @@ void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch, void vpx_mbpost_proc_down_msa(uint8_t *dst_ptr, int32_t pitch, int32_t rows, int32_t cols, int32_t flimit) { int32_t row, col, cnt, i; - const int16_t *rv3 = &vpx_rv[63 & rand()]; v4i32 flimit_vec; v16u8 dst7, dst8, dst_r_b, dst_l_b; v16i8 mask; @@ -601,7 +600,7 @@ void vpx_mbpost_proc_down_msa(uint8_t *dst_ptr, int32_t pitch, int32_t rows, dst = LD_UB(dst_tmp); for (cnt = (col << 4), i = 0; i < 16; ++cnt) { - rv2[i] = rv3 + ((cnt * 17) & 127); + rv2[i] = vpx_rv + (i & 7); ++i; } for (cnt = -8; cnt < 0; ++cnt) { diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/intrapred16_dspr2.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/intrapred16_dspr2.c index 3e29d0ac39f..835e10e125c 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/intrapred16_dspr2.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/intrapred16_dspr2.c @@ -15,6 +15,7 @@ void vpx_h_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; int32_t tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16; + (void)above; __asm__ __volatile__( "lb %[tmp1], (%[left]) \n\t" diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/intrapred4_dspr2.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/intrapred4_dspr2.c index 9f51d50c752..dce03a2b2a0 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/intrapred4_dspr2.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/intrapred4_dspr2.c @@ -14,6 +14,7 @@ void vpx_h_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { int32_t tmp1, tmp2, tmp3, tmp4; + (void)above; __asm__ __volatile__( "lb %[tmp1], (%[left]) \n\t" diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/intrapred8_dspr2.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/intrapred8_dspr2.c index eac79d51000..16e7fc55079 100644 --- 
a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/intrapred8_dspr2.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/intrapred8_dspr2.c @@ -14,6 +14,7 @@ void vpx_h_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; + (void)above; __asm__ __volatile__( "lb %[tmp1], (%[left]) \n\t" diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/inv_txfm_dspr2.h b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/inv_txfm_dspr2.h index edd54aec5e2..27881f0db6c 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/inv_txfm_dspr2.h +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/inv_txfm_dspr2.h @@ -57,18 +57,15 @@ extern "C" { out; \ }) -void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, - int dest_stride); +void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int stride); void vpx_idct4_rows_dspr2(const int16_t *input, int16_t *output); -void vpx_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, - int dest_stride); +void vpx_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, int stride); void iadst4_dspr2(const int16_t *input, int16_t *output); void idct8_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows); -void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, - int dest_stride); +void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, int stride); void iadst8_dspr2(const int16_t *input, int16_t *output); void idct16_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows); -void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride); +void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int stride); void iadst16_dspr2(const int16_t *input, int16_t *output); #endif // #if HAVE_DSPR2 diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/itrans16_dspr2.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/itrans16_dspr2.c index 0ec0c2059f4..44ba65c7ac8 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/itrans16_dspr2.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/itrans16_dspr2.c @@ -389,7 +389,7 @@ void idct16_rows_dspr2(const int16_t *input, int16_t *output, } } -void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) { +void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int stride) { int i; int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; int step1_8, step1_9, step1_10, step1_11; @@ -712,14 +712,14 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) { "add %[load6], %[step1_1], %[step1_6] \n\t" "add %[load6], %[load6], %[step1_14] \n\t" "sb %[load5], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[load8], 0(%[dest_pix]) \n\t" "addi %[load6], %[load6], 32 \n\t" "sra %[load6], %[load6], 6 \n\t" "add %[load8], %[load8], %[load6] \n\t" "lbux %[load6], %[load8](%[cm]) \n\t" "sb %[load6], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[load7], 0(%[dest_pix]) \n\t" "add %[load5], %[step1_2], %[step1_5] \n\t" @@ -731,14 +731,14 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) { "add %[load6], %[step1_3], %[step1_4] \n\t" "add %[load6], %[load6], %[step1_12] \n\t" "sb %[load5], 
0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[load8], 0(%[dest_pix]) \n\t" "addi %[load6], %[load6], 32 \n\t" "sra %[load6], %[load6], 6 \n\t" "add %[load8], %[load8], %[load6] \n\t" "lbux %[load6], %[load8](%[cm]) \n\t" "sb %[load6], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[load7], 0(%[dest_pix]) \n\t" "sub %[load5], %[step1_3], %[step1_4] \n\t" @@ -750,14 +750,14 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) { "sub %[load6], %[step1_2], %[step1_5] \n\t" "add %[load6], %[load6], %[step1_10] \n\t" "sb %[load5], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[load8], 0(%[dest_pix]) \n\t" "addi %[load6], %[load6], 32 \n\t" "sra %[load6], %[load6], 6 \n\t" "add %[load8], %[load8], %[load6] \n\t" "lbux %[load6], %[load8](%[cm]) \n\t" "sb %[load6], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "sub %[load5], %[step1_1], %[step1_6] \n\t" "lbu %[load7], 0(%[dest_pix]) \n\t" @@ -769,14 +769,14 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) { "sub %[load6], %[step1_0], %[step1_7] \n\t" "add %[load6], %[load6], %[step1_8] \n\t" "sb %[load5], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[load8], 0(%[dest_pix]) \n\t" "addi %[load6], %[load6], 32 \n\t" "sra %[load6], %[load6], 6 \n\t" "add %[load8], %[load8], %[load6] \n\t" "lbux %[load6], %[load8](%[cm]) \n\t" "sb %[load6], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[load7], 0(%[dest_pix]) \n\t" "sub %[load5], %[step1_0], %[step1_7] \n\t" @@ -788,14 +788,14 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) { "sub %[load6], %[step1_1], %[step1_6] \n\t" "sub %[load6], %[load6], %[step1_9] \n\t" "sb %[load5], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[load8], 0(%[dest_pix]) \n\t" "addi %[load6], %[load6], 32 \n\t" "sra %[load6], %[load6], 6 \n\t" "add %[load8], %[load8], %[load6] \n\t" "lbux %[load6], %[load8](%[cm]) \n\t" "sb %[load6], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[load7], 0(%[dest_pix]) \n\t" "sub %[load5], %[step1_2], %[step1_5] \n\t" @@ -807,14 +807,14 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) { "sub %[load6], %[step1_3], %[step1_4] \n\t" "sub %[load6], %[load6], %[step1_11] \n\t" "sb %[load5], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[load8], 0(%[dest_pix]) \n\t" "addi %[load6], %[load6], 32 \n\t" "sra %[load6], %[load6], 6 \n\t" "add %[load8], %[load8], %[load6] \n\t" "lbux %[load6], %[load8](%[cm]) \n\t" "sb %[load6], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[load7], 0(%[dest_pix]) \n\t" "add %[load5], %[step1_3], %[step1_4] \n\t" @@ -826,14 +826,14 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) 
{ "add %[load6], %[step1_2], %[step1_5] \n\t" "sub %[load6], %[load6], %[step1_13] \n\t" "sb %[load5], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[load8], 0(%[dest_pix]) \n\t" "addi %[load6], %[load6], 32 \n\t" "sra %[load6], %[load6], 6 \n\t" "add %[load8], %[load8], %[load6] \n\t" "lbux %[load6], %[load8](%[cm]) \n\t" "sb %[load6], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[load7], 0(%[dest_pix]) \n\t" "add %[load5], %[step1_1], %[step1_6] \n\t" @@ -845,7 +845,7 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) { "add %[load6], %[step1_0], %[step1_7] \n\t" "sub %[load6], %[load6], %[step1_15] \n\t" "sb %[load5], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[load8], 0(%[dest_pix]) \n\t" "addi %[load6], %[load6], 32 \n\t" "sra %[load6], %[load6], 6 \n\t" @@ -856,7 +856,7 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) { : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7), [load8] "=&r"(load8), [dest_pix] "+r"(dest_pix) : - [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_0] "r"(step1_0), + [cm] "r"(cm), [stride] "r"(stride), [step1_0] "r"(step1_0), [step1_1] "r"(step1_1), [step1_2] "r"(step1_2), [step1_3] "r"(step1_3), [step1_4] "r"(step1_4), [step1_5] "r"(step1_5), [step1_6] "r"(step1_6), [step1_7] "r"(step1_7), [step1_8] "r"(step1_8), [step1_9] "r"(step1_9), @@ -869,7 +869,7 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) { } void vpx_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride) { + int stride) { DECLARE_ALIGNED(32, int16_t, out[16 * 16]); uint32_t pos = 45; @@ -880,11 +880,11 @@ void vpx_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, idct16_rows_dspr2(input, out, 16); // Then transform columns and add to dest - idct16_cols_add_blk_dspr2(out, dest, dest_stride); + idct16_cols_add_blk_dspr2(out, dest, stride); } void vpx_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride) { + int stride) { DECLARE_ALIGNED(32, int16_t, out[16 * 16]); int16_t *outptr = out; uint32_t i; @@ -924,11 +924,11 @@ void vpx_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest, } // Then transform columns - idct16_cols_add_blk_dspr2(out, dest, dest_stride); + idct16_cols_add_blk_dspr2(out, dest, stride); } void vpx_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride) { + int stride) { uint32_t pos = 45; int32_t out; int32_t r; @@ -975,13 +975,54 @@ void vpx_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest, "sw %[vector_2], 4(%[dest]) \n\t" "sw %[vector_3], 8(%[dest]) \n\t" "sw %[vector_4], 12(%[dest]) \n\t" - "add %[dest], %[dest], %[dest_stride] \n\t" + "add %[dest], %[dest], %[stride] \n\t" : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4), [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2), [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4), [dest] "+&r"(dest) - : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1)); + : [stride] "r"(stride), [vector_a1] "r"(vector_a1)); + } + } else if (a1 > 255) { + int32_t a11, a12, vector_a11, vector_a12; + + /* use quad-byte + * input and output memory are four byte aligned */ + a11 = a1 >> 1; + a12 = a1 - a11; + __asm__ __volatile__( + "replv.qb 
%[vector_a11], %[a11] \n\t" + "replv.qb %[vector_a12], %[a12] \n\t" + + : [vector_a11] "=&r"(vector_a11), [vector_a12] "=&r"(vector_a12) + : [a11] "r"(a11), [a12] "r"(a12)); + + for (r = 16; r--;) { + __asm__ __volatile__( + "lw %[t1], 0(%[dest]) \n\t" + "lw %[t2], 4(%[dest]) \n\t" + "lw %[t3], 8(%[dest]) \n\t" + "lw %[t4], 12(%[dest]) \n\t" + "addu_s.qb %[vector_1], %[t1], %[vector_a11] \n\t" + "addu_s.qb %[vector_2], %[t2], %[vector_a11] \n\t" + "addu_s.qb %[vector_3], %[t3], %[vector_a11] \n\t" + "addu_s.qb %[vector_4], %[t4], %[vector_a11] \n\t" + "addu_s.qb %[vector_1], %[vector_1], %[vector_a12] \n\t" + "addu_s.qb %[vector_2], %[vector_2], %[vector_a12] \n\t" + "addu_s.qb %[vector_3], %[vector_3], %[vector_a12] \n\t" + "addu_s.qb %[vector_4], %[vector_4], %[vector_a12] \n\t" + "sw %[vector_1], 0(%[dest]) \n\t" + "sw %[vector_2], 4(%[dest]) \n\t" + "sw %[vector_3], 8(%[dest]) \n\t" + "sw %[vector_4], 12(%[dest]) \n\t" + "add %[dest], %[dest], %[stride] \n\t" + + : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4), + [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2), + [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4), + [dest] "+&r"(dest) + : [stride] "r"(stride), [vector_a11] "r"(vector_a11), + [vector_a12] "r"(vector_a12)); } } else { /* use quad-byte @@ -1005,13 +1046,13 @@ void vpx_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest, "sw %[vector_2], 4(%[dest]) \n\t" "sw %[vector_3], 8(%[dest]) \n\t" "sw %[vector_4], 12(%[dest]) \n\t" - "add %[dest], %[dest], %[dest_stride] \n\t" + "add %[dest], %[dest], %[stride] \n\t" : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4), [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2), [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4), [dest] "+&r"(dest) - : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1)); + : [stride] "r"(stride), [vector_a1] "r"(vector_a1)); } } } diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/itrans32_cols_dspr2.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/itrans32_cols_dspr2.c index ce25d55c9c0..3f043b48baf 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/itrans32_cols_dspr2.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/itrans32_cols_dspr2.c @@ -13,26 +13,25 @@ #include "vpx_dsp/txfm_common.h" #if HAVE_DSPR2 -void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, - int dest_stride) { - int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6; - int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13; - int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19; - int16_t step1_20, step1_21, step1_22, step1_23, step1_24, step1_25, step1_26; - int16_t step1_27, step1_28, step1_29, step1_30, step1_31; - int16_t step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6; - int16_t step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13; - int16_t step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20; - int16_t step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27; - int16_t step2_28, step2_29, step2_30, step2_31; - int16_t step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14; - int16_t step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21; - int16_t step3_22, step3_23, step3_24, step3_25, step3_26, step3_27; - int16_t step3_28, step3_29, step3_30, step3_31; +void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int stride) { + int step1_0, step1_1, step1_2, 
step1_3, step1_4, step1_5, step1_6; + int step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13; + int step1_14, step1_15, step1_16, step1_17, step1_18, step1_19, step1_20; + int step1_21, step1_22, step1_23, step1_24, step1_25, step1_26, step1_27; + int step1_28, step1_29, step1_30, step1_31; + int step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6; + int step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13; + int step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20; + int step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27; + int step2_28, step2_29, step2_30, step2_31; + int step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14; + int step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21; + int step3_22, step3_23, step3_24, step3_25, step3_26, step3_27, step3_28; + int step3_29, step3_30, step3_31; int temp0, temp1, temp2, temp3; int load1, load2, load3, load4; int result1, result2; - int i, temp21; + int i; uint8_t *dest_pix, *dest_pix1; const int const_2_power_13 = 8192; uint8_t *cm = vpx_ff_cropTbl; @@ -49,7 +48,7 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, for (i = 0; i < 32; ++i) { dest_pix = dest + i; - dest_pix1 = dest + i + 31 * dest_stride; + dest_pix1 = dest + i + 31 * stride; __asm__ __volatile__( "lh %[load1], 2(%[input]) \n\t" @@ -103,9 +102,9 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_16] "=r"(step1_16), - [step1_17] "=r"(step1_17), [step1_30] "=r"(step1_30), - [step1_31] "=r"(step1_31) + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), + [step1_16] "=&r"(step1_16), [step1_17] "=&r"(step1_17), + [step1_30] "=&r"(step1_30), [step1_31] "=&r"(step1_31) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), [cospi_31_64] "r"(cospi_31_64), [cospi_1_64] "r"(cospi_1_64), [cospi_4_64] "r"(cospi_4_64), [cospi_17_64] "r"(cospi_17_64), @@ -163,9 +162,9 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_18] "=r"(step1_18), - [step1_19] "=r"(step1_19), [step1_28] "=r"(step1_28), - [step1_29] "=r"(step1_29) + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), + [step1_18] "=&r"(step1_18), [step1_19] "=&r"(step1_19), + [step1_28] "=&r"(step1_28), [step1_29] "=&r"(step1_29) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), [cospi_23_64] "r"(cospi_23_64), [cospi_9_64] "r"(cospi_9_64), [cospi_4_64] "r"(cospi_4_64), [cospi_7_64] "r"(cospi_7_64), @@ -223,9 +222,9 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_20] "=r"(step1_20), - [step1_21] "=r"(step1_21), [step1_26] "=r"(step1_26), - [step1_27] "=r"(step1_27) + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), + [step1_20] "=&r"(step1_20), [step1_21] "=&r"(step1_21), + [step1_26] "=&r"(step1_26), [step1_27] "=&r"(step1_27) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), [cospi_27_64] "r"(cospi_27_64), [cospi_5_64] "r"(cospi_5_64), [cospi_11_64] "r"(cospi_11_64), [cospi_21_64] 
"r"(cospi_21_64), @@ -279,9 +278,9 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_22] "=r"(step1_22), - [step1_23] "=r"(step1_23), [step1_24] "=r"(step1_24), - [step1_25] "=r"(step1_25) + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), + [step1_22] "=&r"(step1_22), [step1_23] "=&r"(step1_23), + [step1_24] "=&r"(step1_24), [step1_25] "=&r"(step1_25) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), [cospi_19_64] "r"(cospi_19_64), [cospi_13_64] "r"(cospi_13_64), [cospi_3_64] "r"(cospi_3_64), [cospi_29_64] "r"(cospi_29_64), @@ -335,9 +334,9 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_8] "=r"(step2_8), - [step2_9] "=r"(step2_9), [step2_14] "=r"(step2_14), - [step2_15] "=r"(step2_15) + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_8] "=&r"(step2_8), + [step2_9] "=&r"(step2_9), [step2_14] "=&r"(step2_14), + [step2_15] "=&r"(step2_15) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64), [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64), @@ -391,9 +390,9 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_10] "=r"(step2_10), - [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12), - [step2_13] "=r"(step2_13) + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), + [step2_10] "=&r"(step2_10), [step2_11] "=&r"(step2_11), + [step2_12] "=&r"(step2_12), [step2_13] "=&r"(step2_13) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64), [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64), @@ -434,116 +433,154 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "extp %[step3_11], $ac2, 31 \n\t" "extp %[step3_12], $ac3, 31 \n\t" - : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [step3_8] "=r"(step3_8), - [step3_9] "=r"(step3_9), [step3_10] "=r"(step3_10), - [step3_11] "=r"(step3_11), [step3_12] "=r"(step3_12), - [step3_13] "=r"(step3_13), [step3_14] "=r"(step3_14), - [step3_15] "=r"(step3_15) + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [step3_8] "=&r"(step3_8), + [step3_9] "=&r"(step3_9), [step3_10] "=&r"(step3_10), + [step3_11] "=&r"(step3_11), [step3_12] "=&r"(step3_12), + [step3_13] "=&r"(step3_13), [step3_14] "=&r"(step3_14), + [step3_15] "=&r"(step3_15) : [const_2_power_13] "r"(const_2_power_13), [step2_8] "r"(step2_8), [step2_9] "r"(step2_9), [step2_10] "r"(step2_10), [step2_11] "r"(step2_11), [step2_12] "r"(step2_12), [step2_13] "r"(step2_13), [step2_14] "r"(step2_14), [step2_15] "r"(step2_15), [cospi_16_64] "r"(cospi_16_64)); - step2_18 = step1_17 - step1_18; - step2_29 = step1_30 - step1_29; - __asm__ __volatile__( "mtlo %[const_2_power_13], $ac0 \n\t" "mthi $zero, $ac0 \n\t" - "msub $ac0, %[step2_18], %[cospi_8_64] \n\t" - "madd $ac0, %[step2_29], %[cospi_24_64] \n\t" - "extp %[step3_18], $ac0, 31 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sub %[temp0], 
%[step1_17], %[step1_18] \n\t" + "sub %[temp1], %[step1_30], %[step1_29] \n\t" + "add %[step3_17], %[step1_17], %[step1_18] \n\t" + "add %[step3_30], %[step1_30], %[step1_29] \n\t" - : [step3_18] "=r"(step3_18) - : [const_2_power_13] "r"(const_2_power_13), [step2_18] "r"(step2_18), - [step2_29] "r"(step2_29), [cospi_24_64] "r"(cospi_24_64), + "msub $ac0, %[temp0], %[cospi_8_64] \n\t" + "madd $ac0, %[temp1], %[cospi_24_64] \n\t" + "extp %[step3_18], $ac0, 31 \n\t" + "madd $ac1, %[temp0], %[cospi_24_64] \n\t" + "madd $ac1, %[temp1], %[cospi_8_64] \n\t" + "extp %[step3_29], $ac1, 31 \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [step3_18] "=&r"(step3_18), [step3_29] "=&r"(step3_29), + [step3_17] "=&r"(step3_17), [step3_30] "=&r"(step3_30) + : [const_2_power_13] "r"(const_2_power_13), [step1_17] "r"(step1_17), + [step1_18] "r"(step1_18), [step1_30] "r"(step1_30), + [step1_29] "r"(step1_29), [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64)); - temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64; - step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - step2_19 = step1_16 - step1_19; - step2_28 = step1_31 - step1_28; - __asm__ __volatile__( "mtlo %[const_2_power_13], $ac0 \n\t" "mthi $zero, $ac0 \n\t" - "msub $ac0, %[step2_19], %[cospi_8_64] \n\t" - "madd $ac0, %[step2_28], %[cospi_24_64] \n\t" - "extp %[step3_19], $ac0, 31 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sub %[temp0], %[step1_16], %[step1_19] \n\t" + "sub %[temp1], %[step1_31], %[step1_28] \n\t" + "add %[step3_16], %[step1_16], %[step1_19] \n\t" + "add %[step3_31], %[step1_31], %[step1_28] \n\t" - : [step3_19] "=r"(step3_19) - : [const_2_power_13] "r"(const_2_power_13), [step2_19] "r"(step2_19), - [step2_28] "r"(step2_28), [cospi_24_64] "r"(cospi_24_64), + "msub $ac0, %[temp0], %[cospi_8_64] \n\t" + "madd $ac0, %[temp1], %[cospi_24_64] \n\t" + "extp %[step3_19], $ac0, 31 \n\t" + "madd $ac1, %[temp0], %[cospi_24_64] \n\t" + "madd $ac1, %[temp1], %[cospi_8_64] \n\t" + "extp %[step3_28], $ac1, 31 \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [step3_16] "=&r"(step3_16), [step3_31] "=&r"(step3_31), + [step3_19] "=&r"(step3_19), [step3_28] "=&r"(step3_28) + : [const_2_power_13] "r"(const_2_power_13), [step1_16] "r"(step1_16), + [step1_19] "r"(step1_19), [step1_31] "r"(step1_31), + [step1_28] "r"(step1_28), [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64)); - temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64; - step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - step3_16 = step1_16 + step1_19; - step3_17 = step1_17 + step1_18; - step3_30 = step1_29 + step1_30; - step3_31 = step1_28 + step1_31; - - step2_20 = step1_23 - step1_20; - step2_27 = step1_24 - step1_27; - __asm__ __volatile__( "mtlo %[const_2_power_13], $ac0 \n\t" "mthi $zero, $ac0 \n\t" - "msub $ac0, %[step2_20], %[cospi_24_64] \n\t" - "msub $ac0, %[step2_27], %[cospi_8_64] \n\t" - "extp %[step3_20], $ac0, 31 \n\t" - - : [step3_20] "=r"(step3_20) - : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20), - [step2_27] "r"(step2_27), [cospi_24_64] "r"(cospi_24_64), + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sub %[temp0], %[step1_23], %[step1_20] \n\t" + "sub %[temp1], %[step1_24], %[step1_27] \n\t" + "add %[step3_23], %[step1_23], %[step1_20] \n\t" + "add %[step3_24], %[step1_24], %[step1_27] \n\t" + + "msub $ac0, %[temp0], %[cospi_8_64] \n\t" + "madd $ac0, %[temp1], %[cospi_24_64] \n\t" + "extp %[step3_27], $ac0, 31 \n\t" + "msub 
$ac1, %[temp0], %[cospi_24_64] \n\t" + "msub $ac1, %[temp1], %[cospi_8_64] \n\t" + "extp %[step3_20], $ac1, 31 \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [step3_23] "=&r"(step3_23), [step3_24] "=&r"(step3_24), + [step3_20] "=&r"(step3_20), [step3_27] "=&r"(step3_27) + : [const_2_power_13] "r"(const_2_power_13), [step1_23] "r"(step1_23), + [step1_20] "r"(step1_20), [step1_24] "r"(step1_24), + [step1_27] "r"(step1_27), [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64)); - temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64; - step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - step2_21 = step1_22 - step1_21; - step2_26 = step1_25 - step1_26; - __asm__ __volatile__( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" "mtlo %[const_2_power_13], $ac1 \n\t" "mthi $zero, $ac1 \n\t" - "msub $ac1, %[step2_21], %[cospi_24_64] \n\t" - "msub $ac1, %[step2_26], %[cospi_8_64] \n\t" - "extp %[step3_21], $ac1, 31 \n\t" - - : [step3_21] "=r"(step3_21) - : [const_2_power_13] "r"(const_2_power_13), [step2_21] "r"(step2_21), - [step2_26] "r"(step2_26), [cospi_24_64] "r"(cospi_24_64), + "sub %[temp0], %[step1_22], %[step1_21] \n\t" + "sub %[temp1], %[step1_25], %[step1_26] \n\t" + "add %[step3_22], %[step1_22], %[step1_21] \n\t" + "add %[step3_25], %[step1_25], %[step1_26] \n\t" + + "msub $ac0, %[temp0], %[cospi_24_64] \n\t" + "msub $ac0, %[temp1], %[cospi_8_64] \n\t" + "extp %[step3_21], $ac0, 31 \n\t" + "msub $ac1, %[temp0], %[cospi_8_64] \n\t" + "madd $ac1, %[temp1], %[cospi_24_64] \n\t" + "extp %[step3_26], $ac1, 31 \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [step3_22] "=&r"(step3_22), [step3_25] "=&r"(step3_25), + [step3_21] "=&r"(step3_21), [step3_26] "=&r"(step3_26) + : [const_2_power_13] "r"(const_2_power_13), [step1_22] "r"(step1_22), + [step1_21] "r"(step1_21), [step1_25] "r"(step1_25), + [step1_26] "r"(step1_26), [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64)); - temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64; - step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - step3_22 = step1_21 + step1_22; - step3_23 = step1_20 + step1_23; - step3_24 = step1_24 + step1_27; - step3_25 = step1_25 + step1_26; - - step2_16 = step3_16 + step3_23; - step2_17 = step3_17 + step3_22; - step2_18 = step3_18 + step3_21; - step2_19 = step3_19 + step3_20; - step2_20 = step3_19 - step3_20; - step2_21 = step3_18 - step3_21; - step2_22 = step3_17 - step3_22; - step2_23 = step3_16 - step3_23; - - step2_24 = step3_31 - step3_24; - step2_25 = step3_30 - step3_25; - step2_26 = step3_29 - step3_26; - step2_27 = step3_28 - step3_27; - step2_28 = step3_28 + step3_27; - step2_29 = step3_29 + step3_26; - step2_30 = step3_30 + step3_25; - step2_31 = step3_31 + step3_24; + __asm__ __volatile__( + "add %[step2_16], %[step3_16], %[step3_23] \n\t" + "add %[step2_17], %[step3_17], %[step3_22] \n\t" + "add %[step2_18], %[step3_18], %[step3_21] \n\t" + "add %[step2_19], %[step3_19], %[step3_20] \n\t" + "sub %[step2_20], %[step3_19], %[step3_20] \n\t" + "sub %[step2_21], %[step3_18], %[step3_21] \n\t" + "sub %[step2_22], %[step3_17], %[step3_22] \n\t" + "sub %[step2_23], %[step3_16], %[step3_23] \n\t" + + : [step2_16] "=&r"(step2_16), [step2_17] "=&r"(step2_17), + [step2_18] "=&r"(step2_18), [step2_19] "=&r"(step2_19), + [step2_20] "=&r"(step2_20), [step2_21] "=&r"(step2_21), + [step2_22] "=&r"(step2_22), [step2_23] "=&r"(step2_23) + : [step3_16] "r"(step3_16), [step3_23] "r"(step3_23), + [step3_17] "r"(step3_17), [step3_22] "r"(step3_22), 
+ [step3_18] "r"(step3_18), [step3_21] "r"(step3_21), + [step3_19] "r"(step3_19), [step3_20] "r"(step3_20)); + + __asm__ __volatile__( + "sub %[step2_24], %[step3_31], %[step3_24] \n\t" + "sub %[step2_25], %[step3_30], %[step3_25] \n\t" + "sub %[step2_26], %[step3_29], %[step3_26] \n\t" + "sub %[step2_27], %[step3_28], %[step3_27] \n\t" + "add %[step2_28], %[step3_28], %[step3_27] \n\t" + "add %[step2_29], %[step3_29], %[step3_26] \n\t" + "add %[step2_30], %[step3_30], %[step3_25] \n\t" + "add %[step2_31], %[step3_31], %[step3_24] \n\t" + + : [step2_24] "=&r"(step2_24), [step2_28] "=&r"(step2_28), + [step2_25] "=&r"(step2_25), [step2_29] "=&r"(step2_29), + [step2_26] "=&r"(step2_26), [step2_30] "=&r"(step2_30), + [step2_27] "=&r"(step2_27), [step2_31] "=&r"(step2_31) + : [step3_31] "r"(step3_31), [step3_24] "r"(step3_24), + [step3_30] "r"(step3_30), [step3_25] "r"(step3_25), + [step3_29] "r"(step3_29), [step3_26] "r"(step3_26), + [step3_28] "r"(step3_28), [step3_27] "r"(step3_27)); __asm__ __volatile__( "lh %[load1], 0(%[input]) \n\t" @@ -580,9 +617,9 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), [load4] "=&r"(load4), [result1] "=&r"(result1), [result2] "=&r"(result2), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_0] "=r"(step1_0), - [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2), - [step1_3] "=r"(step1_3) + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_0] "=&r"(step1_0), + [step1_1] "=&r"(step1_1), [step1_2] "=&r"(step1_2), + [step1_3] "=&r"(step1_3) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64), [cospi_16_64] "r"(cospi_16_64)); @@ -638,96 +675,137 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_4] "=r"(step1_4), - [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6), - [step1_7] "=r"(step1_7) + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_4] "=&r"(step1_4), + [step1_5] "=&r"(step1_5), [step1_6] "=&r"(step1_6), + [step1_7] "=&r"(step1_7) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64), [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64), [cospi_16_64] "r"(cospi_16_64)); - step2_0 = step1_0 + step1_7; - step2_1 = step1_1 + step1_6; - step2_2 = step1_2 + step1_5; - step2_3 = step1_3 + step1_4; - step2_4 = step1_3 - step1_4; - step2_5 = step1_2 - step1_5; - step2_6 = step1_1 - step1_6; - step2_7 = step1_0 - step1_7; + __asm__ __volatile__( + "add %[step2_0], %[step1_0], %[step1_7] \n\t" + "add %[step2_1], %[step1_1], %[step1_6] \n\t" + "add %[step2_2], %[step1_2], %[step1_5] \n\t" + "add %[step2_3], %[step1_3], %[step1_4] \n\t" + "sub %[step2_4], %[step1_3], %[step1_4] \n\t" + "sub %[step2_5], %[step1_2], %[step1_5] \n\t" + "sub %[step2_6], %[step1_1], %[step1_6] \n\t" + "sub %[step2_7], %[step1_0], %[step1_7] \n\t" + + : [step2_0] "=&r"(step2_0), [step2_4] "=&r"(step2_4), + [step2_1] "=&r"(step2_1), [step2_5] "=&r"(step2_5), + [step2_2] "=&r"(step2_2), [step2_6] "=&r"(step2_6), + [step2_3] "=&r"(step2_3), [step2_7] "=&r"(step2_7) + : [step1_0] "r"(step1_0), [step1_7] "r"(step1_7), + [step1_1] "r"(step1_1), [step1_6] "r"(step1_6), + [step1_2] "r"(step1_2), [step1_5] 
"r"(step1_5), + [step1_3] "r"(step1_3), [step1_4] "r"(step1_4)); // stage 7 - step1_0 = step2_0 + step3_15; - step1_1 = step2_1 + step3_14; - step1_2 = step2_2 + step3_13; - step1_3 = step2_3 + step3_12; - step1_4 = step2_4 + step3_11; - step1_5 = step2_5 + step3_10; - step1_6 = step2_6 + step3_9; - step1_7 = step2_7 + step3_8; - step1_8 = step2_7 - step3_8; - step1_9 = step2_6 - step3_9; - step1_10 = step2_5 - step3_10; - step1_11 = step2_4 - step3_11; - step1_12 = step2_3 - step3_12; - step1_13 = step2_2 - step3_13; - step1_14 = step2_1 - step3_14; - step1_15 = step2_0 - step3_15; - __asm__ __volatile__( - "sub %[temp0], %[step2_27], %[step2_20] \n\t" - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "madd $ac0, %[temp0], %[cospi_16_64] \n\t" - "extp %[step1_20], $ac0, 31 \n\t" + "add %[step1_0], %[step2_0], %[step3_15] \n\t" + "add %[step1_1], %[step2_1], %[step3_14] \n\t" + "add %[step1_2], %[step2_2], %[step3_13] \n\t" + "add %[step1_3], %[step2_3], %[step3_12] \n\t" + "sub %[step1_12], %[step2_3], %[step3_12] \n\t" + "sub %[step1_13], %[step2_2], %[step3_13] \n\t" + "sub %[step1_14], %[step2_1], %[step3_14] \n\t" + "sub %[step1_15], %[step2_0], %[step3_15] \n\t" + + : [step1_0] "=&r"(step1_0), [step1_12] "=&r"(step1_12), + [step1_1] "=&r"(step1_1), [step1_13] "=&r"(step1_13), + [step1_2] "=&r"(step1_2), [step1_14] "=&r"(step1_14), + [step1_3] "=&r"(step1_3), [step1_15] "=&r"(step1_15) + : [step2_0] "r"(step2_0), [step3_15] "r"(step3_15), + [step2_1] "r"(step2_1), [step3_14] "r"(step3_14), + [step2_2] "r"(step2_2), [step3_13] "r"(step3_13), + [step2_3] "r"(step2_3), [step3_12] "r"(step3_12)); - : [temp0] "=&r"(temp0), [step1_20] "=r"(step1_20) - : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20), - [step2_27] "r"(step2_27), [cospi_16_64] "r"(cospi_16_64)); - - temp21 = (step2_20 + step2_27) * cospi_16_64; - step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + __asm__ __volatile__( + "add %[step1_4], %[step2_4], %[step3_11] \n\t" + "add %[step1_5], %[step2_5], %[step3_10] \n\t" + "add %[step1_6], %[step2_6], %[step3_9] \n\t" + "add %[step1_7], %[step2_7], %[step3_8] \n\t" + "sub %[step1_8], %[step2_7], %[step3_8] \n\t" + "sub %[step1_9], %[step2_6], %[step3_9] \n\t" + "sub %[step1_10], %[step2_5], %[step3_10] \n\t" + "sub %[step1_11], %[step2_4], %[step3_11] \n\t" + + : [step1_4] "=&r"(step1_4), [step1_8] "=&r"(step1_8), + [step1_5] "=&r"(step1_5), [step1_9] "=&r"(step1_9), + [step1_6] "=&r"(step1_6), [step1_10] "=&r"(step1_10), + [step1_7] "=&r"(step1_7), [step1_11] "=&r"(step1_11) + : [step2_4] "r"(step2_4), [step3_11] "r"(step3_11), + [step2_5] "r"(step2_5), [step3_10] "r"(step3_10), + [step2_6] "r"(step2_6), [step3_9] "r"(step3_9), + [step2_7] "r"(step2_7), [step3_8] "r"(step3_8)); __asm__ __volatile__( - "sub %[temp0], %[step2_26], %[step2_21] \n\t" + "sub %[temp0], %[step2_27], %[step2_20] \n\t" + "add %[temp1], %[step2_27], %[step2_20] \n\t" + "sub %[temp2], %[step2_26], %[step2_21] \n\t" + "add %[temp3], %[step2_26], %[step2_21] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" - "extp %[step1_21], $ac0, 31 \n\t" + "madd $ac1, %[temp1], %[cospi_16_64] \n\t" + "madd $ac2, %[temp2], %[cospi_16_64] \n\t" + "madd $ac3, %[temp3], %[cospi_16_64] \n\t" - : [temp0] "=&r"(temp0), 
[step1_21] "=r"(step1_21) - : [const_2_power_13] "r"(const_2_power_13), [step2_26] "r"(step2_26), - [step2_21] "r"(step2_21), [cospi_16_64] "r"(cospi_16_64)); + "extp %[step1_20], $ac0, 31 \n\t" + "extp %[step1_27], $ac1, 31 \n\t" + "extp %[step1_21], $ac2, 31 \n\t" + "extp %[step1_26], $ac3, 31 \n\t" - temp21 = (step2_21 + step2_26) * cospi_16_64; - step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), + [temp3] "=&r"(temp3), [step1_20] "=&r"(step1_20), + [step1_27] "=&r"(step1_27), [step1_21] "=&r"(step1_21), + [step1_26] "=&r"(step1_26) + : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20), + [step2_27] "r"(step2_27), [step2_21] "r"(step2_21), + [step2_26] "r"(step2_26), [cospi_16_64] "r"(cospi_16_64)); __asm__ __volatile__( "sub %[temp0], %[step2_25], %[step2_22] \n\t" - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "madd $ac0, %[temp0], %[cospi_16_64] \n\t" - "extp %[step1_22], $ac0, 31 \n\t" - - : [temp0] "=&r"(temp0), [step1_22] "=r"(step1_22) - : [const_2_power_13] "r"(const_2_power_13), [step2_25] "r"(step2_25), - [step2_22] "r"(step2_22), [cospi_16_64] "r"(cospi_16_64)); + "add %[temp1], %[step2_25], %[step2_22] \n\t" + "sub %[temp2], %[step2_24], %[step2_23] \n\t" + "add %[temp3], %[step2_24], %[step2_23] \n\t" - temp21 = (step2_22 + step2_25) * cospi_16_64; - step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - __asm__ __volatile__( - "sub %[temp0], %[step2_24], %[step2_23] \n\t" "mtlo %[const_2_power_13], $ac0 \n\t" "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" - "extp %[step1_23], $ac0, 31 \n\t" + "madd $ac1, %[temp1], %[cospi_16_64] \n\t" + "madd $ac2, %[temp2], %[cospi_16_64] \n\t" + "madd $ac3, %[temp3], %[cospi_16_64] \n\t" - : [temp0] "=&r"(temp0), [step1_23] "=r"(step1_23) - : [const_2_power_13] "r"(const_2_power_13), [step2_24] "r"(step2_24), - [step2_23] "r"(step2_23), [cospi_16_64] "r"(cospi_16_64)); + "extp %[step1_22], $ac0, 31 \n\t" + "extp %[step1_25], $ac1, 31 \n\t" + "extp %[step1_23], $ac2, 31 \n\t" + "extp %[step1_24], $ac3, 31 \n\t" - temp21 = (step2_23 + step2_24) * cospi_16_64; - step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), + [temp3] "=&r"(temp3), [step1_22] "=&r"(step1_22), + [step1_25] "=&r"(step1_25), [step1_23] "=&r"(step1_23), + [step1_24] "=&r"(step1_24) + : [const_2_power_13] "r"(const_2_power_13), [step2_22] "r"(step2_22), + [step2_25] "r"(step2_25), [step2_23] "r"(step2_23), + [step2_24] "r"(step2_24), [cospi_16_64] "r"(cospi_16_64)); __asm__ __volatile__( "lbu %[temp2], 0(%[dest_pix]) \n\t" @@ -738,14 +816,14 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "lbux %[temp0], %[temp2](%[cm]) \n\t" "add %[temp1], %[step1_1], %[step2_30] \n\t" "sb %[temp0], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[temp3], 0(%[dest_pix]) \n\t" "addi %[temp1], %[temp1], 32 \n\t" "sra %[temp1], %[temp1], 6 \n\t" "add %[temp3], %[temp3], %[temp1] \n\t" "lbux %[temp1], %[temp3](%[cm]) \n\t" "sb %[temp1], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu 
%[temp2], 0(%[dest_pix]) \n\t" "add %[temp0], %[step1_2], %[step2_29] \n\t" @@ -755,18 +833,18 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "lbux %[temp0], %[temp2](%[cm]) \n\t" "add %[temp1], %[step1_3], %[step2_28] \n\t" "sb %[temp0], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[temp3], 0(%[dest_pix]) \n\t" "addi %[temp1], %[temp1], 32 \n\t" "sra %[temp1], %[temp1], 6 \n\t" "add %[temp3], %[temp3], %[temp1] \n\t" "lbux %[temp1], %[temp3](%[cm]) \n\t" "sb %[temp1], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix) - : [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_0] "r"(step1_0), + : [cm] "r"(cm), [stride] "r"(stride), [step1_0] "r"(step1_0), [step1_1] "r"(step1_1), [step1_2] "r"(step1_2), [step1_3] "r"(step1_3), [step2_28] "r"(step2_28), [step2_29] "r"(step2_29), [step2_30] "r"(step2_30), @@ -782,29 +860,29 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "add %[temp2], %[temp2], %[step3_15] \n\t" "lbux %[temp0], %[temp2](%[cm]) \n\t" "sb %[temp0], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" "lbu %[temp3], 0(%[dest_pix1]) \n\t" "add %[temp3], %[temp3], %[step3_14] \n\t" "lbux %[temp1], %[temp3](%[cm]) \n\t" "sb %[temp1], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" "lbu %[temp2], 0(%[dest_pix1]) \n\t" "add %[temp2], %[temp2], %[step3_13] \n\t" "lbux %[temp0], %[temp2](%[cm]) \n\t" "sb %[temp0], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" "lbu %[temp3], 0(%[dest_pix1]) \n\t" "add %[temp3], %[temp3], %[step3_12] \n\t" "lbux %[temp1], %[temp3](%[cm]) \n\t" "sb %[temp1], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1) - : [cm] "r"(cm), [dest_stride] "r"(dest_stride), - [step3_12] "r"(step3_12), [step3_13] "r"(step3_13), - [step3_14] "r"(step3_14), [step3_15] "r"(step3_15)); + : [cm] "r"(cm), [stride] "r"(stride), [step3_12] "r"(step3_12), + [step3_13] "r"(step3_13), [step3_14] "r"(step3_14), + [step3_15] "r"(step3_15)); __asm__ __volatile__( "lbu %[temp2], 0(%[dest_pix]) \n\t" @@ -815,14 +893,14 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "lbux %[temp0], %[temp2](%[cm]) \n\t" "add %[temp1], %[step1_5], %[step1_26] \n\t" "sb %[temp0], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[temp3], 0(%[dest_pix]) \n\t" "addi %[temp1], %[temp1], 32 \n\t" "sra %[temp1], %[temp1], 6 \n\t" "add %[temp3], %[temp3], %[temp1] \n\t" "lbux %[temp1], %[temp3](%[cm]) \n\t" "sb %[temp1], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[temp2], 0(%[dest_pix]) \n\t" "add %[temp0], %[step1_6], %[step1_25] \n\t" @@ -832,18 +910,18 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "lbux %[temp0], %[temp2](%[cm]) \n\t" "add %[temp1], 
%[step1_7], %[step1_24] \n\t" "sb %[temp0], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[temp3], 0(%[dest_pix]) \n\t" "addi %[temp1], %[temp1], 32 \n\t" "sra %[temp1], %[temp1], 6 \n\t" "add %[temp3], %[temp3], %[temp1] \n\t" "lbux %[temp1], %[temp3](%[cm]) \n\t" "sb %[temp1], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix) - : [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_4] "r"(step1_4), + : [cm] "r"(cm), [stride] "r"(stride), [step1_4] "r"(step1_4), [step1_5] "r"(step1_5), [step1_6] "r"(step1_6), [step1_7] "r"(step1_7), [step1_24] "r"(step1_24), [step1_25] "r"(step1_25), [step1_26] "r"(step1_26), @@ -859,29 +937,29 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "add %[temp2], %[temp2], %[step3_15] \n\t" "lbux %[temp0], %[temp2](%[cm]) \n\t" "sb %[temp0], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" "lbu %[temp3], 0(%[dest_pix1]) \n\t" "add %[temp3], %[temp3], %[step3_14] \n\t" "lbux %[temp1], %[temp3](%[cm]) \n\t" "sb %[temp1], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" "lbu %[temp2], 0(%[dest_pix1]) \n\t" "add %[temp2], %[temp2], %[step3_13] \n\t" "lbux %[temp0], %[temp2](%[cm]) \n\t" "sb %[temp0], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" "lbu %[temp3], 0(%[dest_pix1]) \n\t" "add %[temp3], %[temp3], %[step3_12] \n\t" "lbux %[temp1], %[temp3](%[cm]) \n\t" "sb %[temp1], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1) - : [cm] "r"(cm), [dest_stride] "r"(dest_stride), - [step3_12] "r"(step3_12), [step3_13] "r"(step3_13), - [step3_14] "r"(step3_14), [step3_15] "r"(step3_15)); + : [cm] "r"(cm), [stride] "r"(stride), [step3_12] "r"(step3_12), + [step3_13] "r"(step3_13), [step3_14] "r"(step3_14), + [step3_15] "r"(step3_15)); __asm__ __volatile__( "lbu %[temp2], 0(%[dest_pix]) \n\t" @@ -892,14 +970,14 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "lbux %[temp0], %[temp2](%[cm]) \n\t" "add %[temp1], %[step1_9], %[step1_22] \n\t" "sb %[temp0], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[temp3], 0(%[dest_pix]) \n\t" "addi %[temp1], %[temp1], 32 \n\t" "sra %[temp1], %[temp1], 6 \n\t" "add %[temp3], %[temp3], %[temp1] \n\t" "lbux %[temp1], %[temp3](%[cm]) \n\t" "sb %[temp1], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[temp2], 0(%[dest_pix]) \n\t" "add %[temp0], %[step1_10], %[step1_21] \n\t" @@ -909,18 +987,18 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "lbux %[temp0], %[temp2](%[cm]) \n\t" "add %[temp1], %[step1_11], %[step1_20] \n\t" "sb %[temp0], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[temp3], 0(%[dest_pix]) \n\t" "addi 
%[temp1], %[temp1], 32 \n\t" "sra %[temp1], %[temp1], 6 \n\t" "add %[temp3], %[temp3], %[temp1] \n\t" "lbux %[temp1], %[temp3](%[cm]) \n\t" "sb %[temp1], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix) - : [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_8] "r"(step1_8), + : [cm] "r"(cm), [stride] "r"(stride), [step1_8] "r"(step1_8), [step1_9] "r"(step1_9), [step1_10] "r"(step1_10), [step1_11] "r"(step1_11), [step1_20] "r"(step1_20), [step1_21] "r"(step1_21), [step1_22] "r"(step1_22), @@ -936,29 +1014,29 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "add %[temp2], %[temp2], %[step3_15] \n\t" "lbux %[temp0], %[temp2](%[cm]) \n\t" "sb %[temp0], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" "lbu %[temp3], 0(%[dest_pix1]) \n\t" "add %[temp3], %[temp3], %[step3_14] \n\t" "lbux %[temp1], %[temp3](%[cm]) \n\t" "sb %[temp1], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" "lbu %[temp2], 0(%[dest_pix1]) \n\t" "add %[temp2], %[temp2], %[step3_13] \n\t" "lbux %[temp0], %[temp2](%[cm]) \n\t" "sb %[temp0], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" "lbu %[temp3], 0(%[dest_pix1]) \n\t" "add %[temp3], %[temp3], %[step3_12] \n\t" "lbux %[temp1], %[temp3](%[cm]) \n\t" "sb %[temp1], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1) - : [cm] "r"(cm), [dest_stride] "r"(dest_stride), - [step3_12] "r"(step3_12), [step3_13] "r"(step3_13), - [step3_14] "r"(step3_14), [step3_15] "r"(step3_15)); + : [cm] "r"(cm), [stride] "r"(stride), [step3_12] "r"(step3_12), + [step3_13] "r"(step3_13), [step3_14] "r"(step3_14), + [step3_15] "r"(step3_15)); __asm__ __volatile__( "lbu %[temp2], 0(%[dest_pix]) \n\t" @@ -969,14 +1047,14 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "lbux %[temp0], %[temp2](%[cm]) \n\t" "add %[temp1], %[step1_13], %[step2_18] \n\t" "sb %[temp0], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[temp3], 0(%[dest_pix]) \n\t" "addi %[temp1], %[temp1], 32 \n\t" "sra %[temp1], %[temp1], 6 \n\t" "add %[temp3], %[temp3], %[temp1] \n\t" "lbux %[temp1], %[temp3](%[cm]) \n\t" "sb %[temp1], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[temp2], 0(%[dest_pix]) \n\t" "add %[temp0], %[step1_14], %[step2_17] \n\t" @@ -986,7 +1064,7 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "lbux %[temp0], %[temp2](%[cm]) \n\t" "add %[temp1], %[step1_15], %[step2_16] \n\t" "sb %[temp0], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[temp3], 0(%[dest_pix]) \n\t" "addi %[temp1], %[temp1], 32 \n\t" "sra %[temp1], %[temp1], 6 \n\t" @@ -996,11 +1074,11 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] 
"=&r"(temp2), [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix) - : [cm] "r"(cm), [dest_stride] "r"(dest_stride), - [step1_12] "r"(step1_12), [step1_13] "r"(step1_13), - [step1_14] "r"(step1_14), [step1_15] "r"(step1_15), - [step2_16] "r"(step2_16), [step2_17] "r"(step2_17), - [step2_18] "r"(step2_18), [step2_19] "r"(step2_19)); + : [cm] "r"(cm), [stride] "r"(stride), [step1_12] "r"(step1_12), + [step1_13] "r"(step1_13), [step1_14] "r"(step1_14), + [step1_15] "r"(step1_15), [step2_16] "r"(step2_16), + [step2_17] "r"(step2_17), [step2_18] "r"(step2_18), + [step2_19] "r"(step2_19)); step3_12 = ROUND_POWER_OF_TWO((step1_15 - step2_16), 6); step3_13 = ROUND_POWER_OF_TWO((step1_14 - step2_17), 6); @@ -1012,18 +1090,18 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "add %[temp2], %[temp2], %[step3_15] \n\t" "lbux %[temp0], %[temp2](%[cm]) \n\t" "sb %[temp0], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" "lbu %[temp3], 0(%[dest_pix1]) \n\t" "add %[temp3], %[temp3], %[step3_14] \n\t" "lbux %[temp1], %[temp3](%[cm]) \n\t" "sb %[temp1], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" "lbu %[temp2], 0(%[dest_pix1]) \n\t" "add %[temp2], %[temp2], %[step3_13] \n\t" "lbux %[temp0], %[temp2](%[cm]) \n\t" "sb %[temp0], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" "lbu %[temp3], 0(%[dest_pix1]) \n\t" "add %[temp3], %[temp3], %[step3_12] \n\t" "lbux %[temp1], %[temp3](%[cm]) \n\t" @@ -1031,9 +1109,9 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1) - : [cm] "r"(cm), [dest_stride] "r"(dest_stride), - [step3_12] "r"(step3_12), [step3_13] "r"(step3_13), - [step3_14] "r"(step3_14), [step3_15] "r"(step3_15)); + : [cm] "r"(cm), [stride] "r"(stride), [step3_12] "r"(step3_12), + [step3_13] "r"(step3_13), [step3_14] "r"(step3_14), + [step3_15] "r"(step3_15)); input += 32; } diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/itrans32_dspr2.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/itrans32_dspr2.c index d71c5ffed51..3c0468c00fa 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/itrans32_dspr2.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/itrans32_dspr2.c @@ -18,24 +18,23 @@ #if HAVE_DSPR2 static void idct32_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows) { - int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6; - int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13; - int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19, step1_20; - int16_t step1_21, step1_22, step1_23, step1_24, step1_25, step1_26, step1_27; - int16_t step1_28, step1_29, step1_30, step1_31; - int16_t step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6; - int16_t step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13; - int16_t step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20; - int16_t step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27; - int16_t step2_28, step2_29, step2_30, step2_31; - int16_t step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14; - int16_t step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21; - int16_t 
step3_22, step3_23, step3_24, step3_25, step3_26, step3_27, step3_28; - int16_t step3_29, step3_30, step3_31; + int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6; + int step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13; + int step1_14, step1_15, step1_16, step1_17, step1_18, step1_19, step1_20; + int step1_21, step1_22, step1_23, step1_24, step1_25, step1_26, step1_27; + int step1_28, step1_29, step1_30, step1_31; + int step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6; + int step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13; + int step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20; + int step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27; + int step2_28, step2_29, step2_30, step2_31; + int step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14; + int step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21; + int step3_22, step3_23, step3_24, step3_25, step3_26, step3_27, step3_28; + int step3_29, step3_30, step3_31; int temp0, temp1, temp2, temp3; int load1, load2, load3, load4; int result1, result2; - int temp21; int i; const int const_2_power_13 = 8192; const int32_t *input_int; @@ -147,9 +146,9 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_16] "=r"(step1_16), - [step1_17] "=r"(step1_17), [step1_30] "=r"(step1_30), - [step1_31] "=r"(step1_31) + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), + [step1_16] "=&r"(step1_16), [step1_17] "=&r"(step1_17), + [step1_30] "=&r"(step1_30), [step1_31] "=&r"(step1_31) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), [cospi_31_64] "r"(cospi_31_64), [cospi_1_64] "r"(cospi_1_64), [cospi_4_64] "r"(cospi_4_64), [cospi_17_64] "r"(cospi_17_64), @@ -207,9 +206,9 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_18] "=r"(step1_18), - [step1_19] "=r"(step1_19), [step1_28] "=r"(step1_28), - [step1_29] "=r"(step1_29) + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), + [step1_18] "=&r"(step1_18), [step1_19] "=&r"(step1_19), + [step1_28] "=&r"(step1_28), [step1_29] "=&r"(step1_29) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), [cospi_23_64] "r"(cospi_23_64), [cospi_9_64] "r"(cospi_9_64), [cospi_4_64] "r"(cospi_4_64), [cospi_7_64] "r"(cospi_7_64), @@ -267,9 +266,9 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_20] "=r"(step1_20), - [step1_21] "=r"(step1_21), [step1_26] "=r"(step1_26), - [step1_27] "=r"(step1_27) + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), + [step1_20] "=&r"(step1_20), [step1_21] "=&r"(step1_21), + [step1_26] "=&r"(step1_26), [step1_27] "=&r"(step1_27) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), [cospi_27_64] "r"(cospi_27_64), [cospi_5_64] "r"(cospi_5_64), [cospi_11_64] "r"(cospi_11_64), [cospi_21_64] "r"(cospi_21_64), @@ -289,7 +288,6 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "madd $ac1, %[load1], %[cospi_19_64] \n\t" "msub $ac1, 
%[load2], %[cospi_13_64] \n\t" "extp %[temp0], $ac1, 31 \n\t" - "madd $ac3, %[load1], %[cospi_13_64] \n\t" "madd $ac3, %[load2], %[cospi_19_64] \n\t" "extp %[temp3], $ac3, 31 \n\t" @@ -302,7 +300,6 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "madd $ac2, %[load3], %[cospi_3_64] \n\t" "msub $ac2, %[load4], %[cospi_29_64] \n\t" "extp %[temp1], $ac2, 31 \n\t" - "madd $ac1, %[load3], %[cospi_29_64] \n\t" "madd $ac1, %[load4], %[cospi_3_64] \n\t" "extp %[temp2], $ac1, 31 \n\t" @@ -314,12 +311,10 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "sub %[load1], %[temp1], %[temp0] \n\t" "sub %[load2], %[temp2], %[temp3] \n\t" - "msub $ac1, %[load1], %[cospi_12_64] \n\t" "msub $ac1, %[load2], %[cospi_20_64] \n\t" "msub $ac3, %[load1], %[cospi_20_64] \n\t" "madd $ac3, %[load2], %[cospi_12_64] \n\t" - "extp %[step1_22], $ac1, 31 \n\t" "extp %[step1_25], $ac3, 31 \n\t" "add %[step1_23], %[temp0], %[temp1] \n\t" @@ -327,9 +322,9 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_22] "=r"(step1_22), - [step1_23] "=r"(step1_23), [step1_24] "=r"(step1_24), - [step1_25] "=r"(step1_25) + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), + [step1_22] "=&r"(step1_22), [step1_23] "=&r"(step1_23), + [step1_24] "=&r"(step1_24), [step1_25] "=&r"(step1_25) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), [cospi_19_64] "r"(cospi_19_64), [cospi_13_64] "r"(cospi_13_64), [cospi_3_64] "r"(cospi_3_64), [cospi_29_64] "r"(cospi_29_64), @@ -349,7 +344,6 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "madd $ac1, %[load1], %[cospi_30_64] \n\t" "msub $ac1, %[load2], %[cospi_2_64] \n\t" "extp %[temp0], $ac1, 31 \n\t" - "madd $ac3, %[load1], %[cospi_2_64] \n\t" "madd $ac3, %[load2], %[cospi_30_64] \n\t" "extp %[temp3], $ac3, 31 \n\t" @@ -362,7 +356,6 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "madd $ac2, %[load3], %[cospi_14_64] \n\t" "msub $ac2, %[load4], %[cospi_18_64] \n\t" "extp %[temp1], $ac2, 31 \n\t" - "madd $ac1, %[load3], %[cospi_18_64] \n\t" "madd $ac1, %[load4], %[cospi_14_64] \n\t" "extp %[temp2], $ac1, 31 \n\t" @@ -374,12 +367,10 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "sub %[load1], %[temp0], %[temp1] \n\t" "sub %[load2], %[temp3], %[temp2] \n\t" - "msub $ac1, %[load1], %[cospi_8_64] \n\t" "madd $ac1, %[load2], %[cospi_24_64] \n\t" "madd $ac3, %[load1], %[cospi_24_64] \n\t" "madd $ac3, %[load2], %[cospi_8_64] \n\t" - "extp %[step2_9], $ac1, 31 \n\t" "extp %[step2_14], $ac3, 31 \n\t" "add %[step2_8], %[temp0], %[temp1] \n\t" @@ -387,9 +378,9 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_8] "=r"(step2_8), - [step2_9] "=r"(step2_9), [step2_14] "=r"(step2_14), - [step2_15] "=r"(step2_15) + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_8] "=&r"(step2_8), + [step2_9] "=&r"(step2_9), [step2_14] "=&r"(step2_14), + [step2_15] "=&r"(step2_15) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64), [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64), @@ -409,7 +400,6 @@ static void 
idct32_rows_dspr2(const int16_t *input, int16_t *output, "madd $ac1, %[load1], %[cospi_22_64] \n\t" "msub $ac1, %[load2], %[cospi_10_64] \n\t" "extp %[temp0], $ac1, 31 \n\t" - "madd $ac3, %[load1], %[cospi_10_64] \n\t" "madd $ac3, %[load2], %[cospi_22_64] \n\t" "extp %[temp3], $ac3, 31 \n\t" @@ -422,7 +412,6 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "madd $ac2, %[load3], %[cospi_6_64] \n\t" "msub $ac2, %[load4], %[cospi_26_64] \n\t" "extp %[temp1], $ac2, 31 \n\t" - "madd $ac1, %[load3], %[cospi_26_64] \n\t" "madd $ac1, %[load4], %[cospi_6_64] \n\t" "extp %[temp2], $ac1, 31 \n\t" @@ -434,12 +423,10 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "sub %[load1], %[temp1], %[temp0] \n\t" "sub %[load2], %[temp2], %[temp3] \n\t" - "msub $ac1, %[load1], %[cospi_24_64] \n\t" "msub $ac1, %[load2], %[cospi_8_64] \n\t" "madd $ac3, %[load2], %[cospi_24_64] \n\t" "msub $ac3, %[load1], %[cospi_8_64] \n\t" - "extp %[step2_10], $ac1, 31 \n\t" "extp %[step2_13], $ac3, 31 \n\t" "add %[step2_11], %[temp0], %[temp1] \n\t" @@ -447,9 +434,9 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_10] "=r"(step2_10), - [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12), - [step2_13] "=r"(step2_13) + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), + [step2_10] "=&r"(step2_10), [step2_11] "=&r"(step2_11), + [step2_12] "=&r"(step2_12), [step2_13] "=&r"(step2_13) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64), [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64), @@ -462,21 +449,18 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "sub %[temp0], %[temp0], %[step2_9] \n\t" "add %[temp0], %[temp0], %[step2_10] \n\t" "madd $ac0, %[temp0], %[cospi_16_64] \n\t" - "mtlo %[const_2_power_13], $ac1 \n\t" "mthi $zero, $ac1 \n\t" "sub %[temp1], %[step2_14], %[step2_13] \n\t" "add %[temp1], %[temp1], %[step2_9] \n\t" "sub %[temp1], %[temp1], %[step2_10] \n\t" "madd $ac1, %[temp1], %[cospi_16_64] \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" "mthi $zero, $ac2 \n\t" "sub %[temp0], %[step2_15], %[step2_12] \n\t" "sub %[temp0], %[temp0], %[step2_8] \n\t" "add %[temp0], %[temp0], %[step2_11] \n\t" "madd $ac2, %[temp0], %[cospi_16_64] \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" "mthi $zero, $ac3 \n\t" "sub %[temp1], %[step2_15], %[step2_12] \n\t" @@ -488,122 +472,159 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "add %[step3_9], %[step2_9], %[step2_10] \n\t" "add %[step3_14], %[step2_13], %[step2_14] \n\t" "add %[step3_15], %[step2_12], %[step2_15] \n\t" - "extp %[step3_10], $ac0, 31 \n\t" "extp %[step3_13], $ac1, 31 \n\t" "extp %[step3_11], $ac2, 31 \n\t" "extp %[step3_12], $ac3, 31 \n\t" - : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [step3_8] "=r"(step3_8), - [step3_9] "=r"(step3_9), [step3_10] "=r"(step3_10), - [step3_11] "=r"(step3_11), [step3_12] "=r"(step3_12), - [step3_13] "=r"(step3_13), [step3_14] "=r"(step3_14), - [step3_15] "=r"(step3_15) + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [step3_8] "=&r"(step3_8), + [step3_9] "=&r"(step3_9), [step3_10] "=&r"(step3_10), + [step3_11] "=&r"(step3_11), [step3_12] "=&r"(step3_12), + [step3_13] "=&r"(step3_13), [step3_14] "=&r"(step3_14), + [step3_15] "=&r"(step3_15) : [const_2_power_13] 
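
Many output operands in these hunks change from "=r" to "=&r". The ampersand marks the operand as earlyclobber, telling the compiler the output register is written before all inputs have been consumed, so it must not share a register with any input operand. A minimal illustrative example of the constraint (not code from libvpx):

static int earlyclobber_demo(int a, int b) {
  int out;
  /* out is written by the first addu while b is still needed by the second,
   * so without the '&' the compiler could legally place out and b in the
   * same register and the second addu would read a clobbered value. */
  __asm__("addu %0, %1, %2 \n\t"
          "addu %0, %0, %2 \n\t"
          : "=&r"(out)
          : "r"(a), "r"(b));
  return out;
}
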
"r"(const_2_power_13), [step2_8] "r"(step2_8), [step2_9] "r"(step2_9), [step2_10] "r"(step2_10), [step2_11] "r"(step2_11), [step2_12] "r"(step2_12), [step2_13] "r"(step2_13), [step2_14] "r"(step2_14), [step2_15] "r"(step2_15), [cospi_16_64] "r"(cospi_16_64)); - step2_18 = step1_17 - step1_18; - step2_29 = step1_30 - step1_29; - __asm__ __volatile__( "mtlo %[const_2_power_13], $ac0 \n\t" "mthi $zero, $ac0 \n\t" - "msub $ac0, %[step2_18], %[cospi_8_64] \n\t" - "madd $ac0, %[step2_29], %[cospi_24_64] \n\t" - "extp %[step3_18], $ac0, 31 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sub %[temp0], %[step1_17], %[step1_18] \n\t" + "sub %[temp1], %[step1_30], %[step1_29] \n\t" + "add %[step3_17], %[step1_17], %[step1_18] \n\t" + "add %[step3_30], %[step1_30], %[step1_29] \n\t" - : [step3_18] "=r"(step3_18) - : [const_2_power_13] "r"(const_2_power_13), [step2_18] "r"(step2_18), - [step2_29] "r"(step2_29), [cospi_24_64] "r"(cospi_24_64), + "msub $ac0, %[temp0], %[cospi_8_64] \n\t" + "madd $ac0, %[temp1], %[cospi_24_64] \n\t" + "extp %[step3_18], $ac0, 31 \n\t" + "madd $ac1, %[temp0], %[cospi_24_64] \n\t" + "madd $ac1, %[temp1], %[cospi_8_64] \n\t" + "extp %[step3_29], $ac1, 31 \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [step3_18] "=&r"(step3_18), [step3_29] "=&r"(step3_29), + [step3_17] "=&r"(step3_17), [step3_30] "=&r"(step3_30) + : [const_2_power_13] "r"(const_2_power_13), [step1_17] "r"(step1_17), + [step1_18] "r"(step1_18), [step1_30] "r"(step1_30), + [step1_29] "r"(step1_29), [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64)); - temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64; - step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - step2_19 = step1_16 - step1_19; - step2_28 = step1_31 - step1_28; - __asm__ __volatile__( "mtlo %[const_2_power_13], $ac0 \n\t" "mthi $zero, $ac0 \n\t" - "msub $ac0, %[step2_19], %[cospi_8_64] \n\t" - "madd $ac0, %[step2_28], %[cospi_24_64] \n\t" - "extp %[step3_19], $ac0, 31 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sub %[temp0], %[step1_16], %[step1_19] \n\t" + "sub %[temp1], %[step1_31], %[step1_28] \n\t" + "add %[step3_16], %[step1_16], %[step1_19] \n\t" + "add %[step3_31], %[step1_31], %[step1_28] \n\t" - : [step3_19] "=r"(step3_19) - : [const_2_power_13] "r"(const_2_power_13), [step2_19] "r"(step2_19), - [step2_28] "r"(step2_28), [cospi_24_64] "r"(cospi_24_64), + "msub $ac0, %[temp0], %[cospi_8_64] \n\t" + "madd $ac0, %[temp1], %[cospi_24_64] \n\t" + "extp %[step3_19], $ac0, 31 \n\t" + "madd $ac1, %[temp0], %[cospi_24_64] \n\t" + "madd $ac1, %[temp1], %[cospi_8_64] \n\t" + "extp %[step3_28], $ac1, 31 \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [step3_16] "=&r"(step3_16), [step3_31] "=&r"(step3_31), + [step3_19] "=&r"(step3_19), [step3_28] "=&r"(step3_28) + : [const_2_power_13] "r"(const_2_power_13), [step1_16] "r"(step1_16), + [step1_19] "r"(step1_19), [step1_31] "r"(step1_31), + [step1_28] "r"(step1_28), [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64)); - temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64; - step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - step3_16 = step1_16 + step1_19; - step3_17 = step1_17 + step1_18; - step3_30 = step1_29 + step1_30; - step3_31 = step1_28 + step1_31; - - step2_20 = step1_23 - step1_20; - step2_27 = step1_24 - step1_27; - __asm__ __volatile__( "mtlo %[const_2_power_13], $ac0 \n\t" "mthi $zero, $ac0 \n\t" - "msub $ac0, %[step2_20], %[cospi_24_64] \n\t" - "msub $ac0, 
%[step2_27], %[cospi_8_64] \n\t" - "extp %[step3_20], $ac0, 31 \n\t" - - : [step3_20] "=r"(step3_20) - : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20), - [step2_27] "r"(step2_27), [cospi_24_64] "r"(cospi_24_64), + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sub %[temp0], %[step1_23], %[step1_20] \n\t" + "sub %[temp1], %[step1_24], %[step1_27] \n\t" + "add %[step3_23], %[step1_23], %[step1_20] \n\t" + "add %[step3_24], %[step1_24], %[step1_27] \n\t" + + "msub $ac0, %[temp0], %[cospi_8_64] \n\t" + "madd $ac0, %[temp1], %[cospi_24_64] \n\t" + "extp %[step3_27], $ac0, 31 \n\t" + "msub $ac1, %[temp0], %[cospi_24_64] \n\t" + "msub $ac1, %[temp1], %[cospi_8_64] \n\t" + "extp %[step3_20], $ac1, 31 \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [step3_23] "=&r"(step3_23), [step3_24] "=&r"(step3_24), + [step3_20] "=&r"(step3_20), [step3_27] "=&r"(step3_27) + : [const_2_power_13] "r"(const_2_power_13), [step1_23] "r"(step1_23), + [step1_20] "r"(step1_20), [step1_24] "r"(step1_24), + [step1_27] "r"(step1_27), [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64)); - temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64; - step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - step2_21 = step1_22 - step1_21; - step2_26 = step1_25 - step1_26; - __asm__ __volatile__( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" "mtlo %[const_2_power_13], $ac1 \n\t" "mthi $zero, $ac1 \n\t" - "msub $ac1, %[step2_21], %[cospi_24_64] \n\t" - "msub $ac1, %[step2_26], %[cospi_8_64] \n\t" - "extp %[step3_21], $ac1, 31 \n\t" - - : [step3_21] "=r"(step3_21) - : [const_2_power_13] "r"(const_2_power_13), [step2_21] "r"(step2_21), - [step2_26] "r"(step2_26), [cospi_24_64] "r"(cospi_24_64), + "sub %[temp0], %[step1_22], %[step1_21] \n\t" + "sub %[temp1], %[step1_25], %[step1_26] \n\t" + "add %[step3_22], %[step1_22], %[step1_21] \n\t" + "add %[step3_25], %[step1_25], %[step1_26] \n\t" + + "msub $ac0, %[temp0], %[cospi_24_64] \n\t" + "msub $ac0, %[temp1], %[cospi_8_64] \n\t" + "extp %[step3_21], $ac0, 31 \n\t" + "msub $ac1, %[temp0], %[cospi_8_64] \n\t" + "madd $ac1, %[temp1], %[cospi_24_64] \n\t" + "extp %[step3_26], $ac1, 31 \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [step3_22] "=&r"(step3_22), [step3_25] "=&r"(step3_25), + [step3_21] "=&r"(step3_21), [step3_26] "=&r"(step3_26) + : [const_2_power_13] "r"(const_2_power_13), [step1_22] "r"(step1_22), + [step1_21] "r"(step1_21), [step1_25] "r"(step1_25), + [step1_26] "r"(step1_26), [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64)); - temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64; - step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - step3_22 = step1_21 + step1_22; - step3_23 = step1_20 + step1_23; - step3_24 = step1_24 + step1_27; - step3_25 = step1_25 + step1_26; - - step2_16 = step3_16 + step3_23; - step2_17 = step3_17 + step3_22; - step2_18 = step3_18 + step3_21; - step2_19 = step3_19 + step3_20; - step2_20 = step3_19 - step3_20; - step2_21 = step3_18 - step3_21; - step2_22 = step3_17 - step3_22; - step2_23 = step3_16 - step3_23; - - step2_24 = step3_31 - step3_24; - step2_25 = step3_30 - step3_25; - step2_26 = step3_29 - step3_26; - step2_27 = step3_28 - step3_27; - step2_28 = step3_28 + step3_27; - step2_29 = step3_29 + step3_26; - step2_30 = step3_30 + step3_25; - step2_31 = step3_31 + step3_24; + __asm__ __volatile__( + "add %[step2_16], %[step3_16], %[step3_23] \n\t" + "add %[step2_17], %[step3_17], %[step3_22] \n\t" + "add 
%[step2_18], %[step3_18], %[step3_21] \n\t" + "add %[step2_19], %[step3_19], %[step3_20] \n\t" + "sub %[step2_20], %[step3_19], %[step3_20] \n\t" + "sub %[step2_21], %[step3_18], %[step3_21] \n\t" + "sub %[step2_22], %[step3_17], %[step3_22] \n\t" + "sub %[step2_23], %[step3_16], %[step3_23] \n\t" + + : [step2_16] "=&r"(step2_16), [step2_17] "=&r"(step2_17), + [step2_18] "=&r"(step2_18), [step2_19] "=&r"(step2_19), + [step2_20] "=&r"(step2_20), [step2_21] "=&r"(step2_21), + [step2_22] "=&r"(step2_22), [step2_23] "=&r"(step2_23) + : [step3_16] "r"(step3_16), [step3_23] "r"(step3_23), + [step3_17] "r"(step3_17), [step3_22] "r"(step3_22), + [step3_18] "r"(step3_18), [step3_21] "r"(step3_21), + [step3_19] "r"(step3_19), [step3_20] "r"(step3_20)); + + __asm__ __volatile__( + "sub %[step2_24], %[step3_31], %[step3_24] \n\t" + "sub %[step2_25], %[step3_30], %[step3_25] \n\t" + "sub %[step2_26], %[step3_29], %[step3_26] \n\t" + "sub %[step2_27], %[step3_28], %[step3_27] \n\t" + "add %[step2_28], %[step3_28], %[step3_27] \n\t" + "add %[step2_29], %[step3_29], %[step3_26] \n\t" + "add %[step2_30], %[step3_30], %[step3_25] \n\t" + "add %[step2_31], %[step3_31], %[step3_24] \n\t" + + : [step2_24] "=&r"(step2_24), [step2_28] "=&r"(step2_28), + [step2_25] "=&r"(step2_25), [step2_29] "=&r"(step2_29), + [step2_26] "=&r"(step2_26), [step2_30] "=&r"(step2_30), + [step2_27] "=&r"(step2_27), [step2_31] "=&r"(step2_31) + : [step3_31] "r"(step3_31), [step3_24] "r"(step3_24), + [step3_30] "r"(step3_30), [step3_25] "r"(step3_25), + [step3_29] "r"(step3_29), [step3_26] "r"(step3_26), + [step3_28] "r"(step3_28), [step3_27] "r"(step3_27)); __asm__ __volatile__( "lh %[load1], 0(%[input]) \n\t" @@ -627,29 +648,25 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "madd $ac3, %[load3], %[cospi_24_64] \n\t" "msub $ac3, %[load4], %[cospi_8_64] \n\t" "extp %[temp2], $ac3, 31 \n\t" - "mtlo %[const_2_power_13], $ac1 \n\t" "mthi $zero, $ac1 \n\t" "madd $ac1, %[load3], %[cospi_8_64] \n\t" "madd $ac1, %[load4], %[cospi_24_64] \n\t" "extp %[temp3], $ac1, 31 \n\t" - - "add %[step1_0], %[temp0], %[temp3] \n\t" - "add %[step1_1], %[temp1], %[temp2] \n\t" - "sub %[step1_2], %[temp1], %[temp2] \n\t" - "sub %[step1_3], %[temp0], %[temp3] \n\t" + "add %[step1_0], %[temp0], %[temp3] \n\t" + "add %[step1_1], %[temp1], %[temp2] \n\t" + "sub %[step1_2], %[temp1], %[temp2] \n\t" + "sub %[step1_3], %[temp0], %[temp3] \n\t" : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), [load4] "=&r"(load4), [result1] "=&r"(result1), [result2] "=&r"(result2), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_0] "=r"(step1_0), - [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2), - [step1_3] "=r"(step1_3) + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_0] "=&r"(step1_0), + [step1_1] "=&r"(step1_1), [step1_2] "=&r"(step1_2), + [step1_3] "=&r"(step1_3) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), - [cospi_16_64] "r"(cospi_16_64), [cospi_24_64] "r"(cospi_24_64), - [cospi_8_64] "r"(cospi_8_64) - - ); + [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64), + [cospi_16_64] "r"(cospi_16_64)); __asm__ __volatile__( "lh %[load1], 8(%[input]) \n\t" @@ -665,7 +682,6 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "madd $ac1, %[load1], %[cospi_28_64] \n\t" "msub $ac1, %[load2], %[cospi_4_64] \n\t" "extp %[temp0], $ac1, 31 \n\t" - "madd $ac3, %[load1], %[cospi_4_64] \n\t" "madd $ac3, %[load2], %[cospi_28_64] \n\t" "extp %[temp3], 
$ac3, 31 \n\t" @@ -678,7 +694,6 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "madd $ac2, %[load3], %[cospi_12_64] \n\t" "msub $ac2, %[load4], %[cospi_20_64] \n\t" "extp %[temp1], $ac2, 31 \n\t" - "madd $ac1, %[load3], %[cospi_20_64] \n\t" "madd $ac1, %[load4], %[cospi_12_64] \n\t" "extp %[temp2], $ac1, 31 \n\t" @@ -691,11 +706,9 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "sub %[load1], %[temp3], %[temp2] \n\t" "sub %[load1], %[load1], %[temp0] \n\t" "add %[load1], %[load1], %[temp1] \n\t" - "sub %[load2], %[temp0], %[temp1] \n\t" "sub %[load2], %[load2], %[temp2] \n\t" "add %[load2], %[load2], %[temp3] \n\t" - "madd $ac1, %[load1], %[cospi_16_64] \n\t" "madd $ac3, %[load2], %[cospi_16_64] \n\t" @@ -706,129 +719,246 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_4] "=r"(step1_4), - [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6), - [step1_7] "=r"(step1_7) + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_4] "=&r"(step1_4), + [step1_5] "=&r"(step1_5), [step1_6] "=&r"(step1_6), + [step1_7] "=&r"(step1_7) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64), [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64), [cospi_16_64] "r"(cospi_16_64)); - step2_0 = step1_0 + step1_7; - step2_1 = step1_1 + step1_6; - step2_2 = step1_2 + step1_5; - step2_3 = step1_3 + step1_4; - step2_4 = step1_3 - step1_4; - step2_5 = step1_2 - step1_5; - step2_6 = step1_1 - step1_6; - step2_7 = step1_0 - step1_7; - - step1_0 = step2_0 + step3_15; - step1_1 = step2_1 + step3_14; - step1_2 = step2_2 + step3_13; - step1_3 = step2_3 + step3_12; - step1_4 = step2_4 + step3_11; - step1_5 = step2_5 + step3_10; - step1_6 = step2_6 + step3_9; - step1_7 = step2_7 + step3_8; - step1_8 = step2_7 - step3_8; - step1_9 = step2_6 - step3_9; - step1_10 = step2_5 - step3_10; - step1_11 = step2_4 - step3_11; - step1_12 = step2_3 - step3_12; - step1_13 = step2_2 - step3_13; - step1_14 = step2_1 - step3_14; - step1_15 = step2_0 - step3_15; - __asm__ __volatile__( - "sub %[temp0], %[step2_27], %[step2_20] \n\t" - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "madd $ac0, %[temp0], %[cospi_16_64] \n\t" - "extp %[step1_20], $ac0, 31 \n\t" - - : [temp0] "=&r"(temp0), [step1_20] "=r"(step1_20) - : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20), - [step2_27] "r"(step2_27), [cospi_16_64] "r"(cospi_16_64)); + "add %[step2_0], %[step1_0], %[step1_7] \n\t" + "add %[step2_1], %[step1_1], %[step1_6] \n\t" + "add %[step2_2], %[step1_2], %[step1_5] \n\t" + "add %[step2_3], %[step1_3], %[step1_4] \n\t" + "sub %[step2_4], %[step1_3], %[step1_4] \n\t" + "sub %[step2_5], %[step1_2], %[step1_5] \n\t" + "sub %[step2_6], %[step1_1], %[step1_6] \n\t" + "sub %[step2_7], %[step1_0], %[step1_7] \n\t" + + : [step2_0] "=&r"(step2_0), [step2_4] "=&r"(step2_4), + [step2_1] "=&r"(step2_1), [step2_5] "=&r"(step2_5), + [step2_2] "=&r"(step2_2), [step2_6] "=&r"(step2_6), + [step2_3] "=&r"(step2_3), [step2_7] "=&r"(step2_7) + : [step1_0] "r"(step1_0), [step1_7] "r"(step1_7), + [step1_1] "r"(step1_1), [step1_6] "r"(step1_6), + [step1_2] "r"(step1_2), [step1_5] "r"(step1_5), + [step1_3] "r"(step1_3), [step1_4] "r"(step1_4)); + + // stage 7 + __asm__ __volatile__( + "add %[step1_0], 
%[step2_0], %[step3_15] \n\t" + "add %[step1_1], %[step2_1], %[step3_14] \n\t" + "add %[step1_2], %[step2_2], %[step3_13] \n\t" + "add %[step1_3], %[step2_3], %[step3_12] \n\t" + "sub %[step1_12], %[step2_3], %[step3_12] \n\t" + "sub %[step1_13], %[step2_2], %[step3_13] \n\t" + "sub %[step1_14], %[step2_1], %[step3_14] \n\t" + "sub %[step1_15], %[step2_0], %[step3_15] \n\t" + + : [step1_0] "=&r"(step1_0), [step1_12] "=&r"(step1_12), + [step1_1] "=&r"(step1_1), [step1_13] "=&r"(step1_13), + [step1_2] "=&r"(step1_2), [step1_14] "=&r"(step1_14), + [step1_3] "=&r"(step1_3), [step1_15] "=&r"(step1_15) + : [step2_0] "r"(step2_0), [step3_15] "r"(step3_15), + [step2_1] "r"(step2_1), [step3_14] "r"(step3_14), + [step2_2] "r"(step2_2), [step3_13] "r"(step3_13), + [step2_3] "r"(step2_3), [step3_12] "r"(step3_12)); - temp21 = (step2_20 + step2_27) * cospi_16_64; - step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + __asm__ __volatile__( + "add %[step1_4], %[step2_4], %[step3_11] \n\t" + "add %[step1_5], %[step2_5], %[step3_10] \n\t" + "add %[step1_6], %[step2_6], %[step3_9] \n\t" + "add %[step1_7], %[step2_7], %[step3_8] \n\t" + "sub %[step1_8], %[step2_7], %[step3_8] \n\t" + "sub %[step1_9], %[step2_6], %[step3_9] \n\t" + "sub %[step1_10], %[step2_5], %[step3_10] \n\t" + "sub %[step1_11], %[step2_4], %[step3_11] \n\t" + + : [step1_4] "=&r"(step1_4), [step1_8] "=&r"(step1_8), + [step1_5] "=&r"(step1_5), [step1_9] "=&r"(step1_9), + [step1_6] "=&r"(step1_6), [step1_10] "=&r"(step1_10), + [step1_7] "=&r"(step1_7), [step1_11] "=&r"(step1_11) + : [step2_4] "r"(step2_4), [step3_11] "r"(step3_11), + [step2_5] "r"(step2_5), [step3_10] "r"(step3_10), + [step2_6] "r"(step2_6), [step3_9] "r"(step3_9), + [step2_7] "r"(step2_7), [step3_8] "r"(step3_8)); __asm__ __volatile__( - "sub %[temp0], %[step2_26], %[step2_21] \n\t" + "sub %[temp0], %[step2_27], %[step2_20] \n\t" + "add %[temp1], %[step2_27], %[step2_20] \n\t" + "sub %[temp2], %[step2_26], %[step2_21] \n\t" + "add %[temp3], %[step2_26], %[step2_21] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" - "extp %[step1_21], $ac0, 31 \n\t" + "madd $ac1, %[temp1], %[cospi_16_64] \n\t" + "madd $ac2, %[temp2], %[cospi_16_64] \n\t" + "madd $ac3, %[temp3], %[cospi_16_64] \n\t" - : [temp0] "=&r"(temp0), [step1_21] "=r"(step1_21) - : [const_2_power_13] "r"(const_2_power_13), [step2_26] "r"(step2_26), - [step2_21] "r"(step2_21), [cospi_16_64] "r"(cospi_16_64)); + "extp %[step1_20], $ac0, 31 \n\t" + "extp %[step1_27], $ac1, 31 \n\t" + "extp %[step1_21], $ac2, 31 \n\t" + "extp %[step1_26], $ac3, 31 \n\t" - temp21 = (step2_21 + step2_26) * cospi_16_64; - step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), + [temp3] "=&r"(temp3), [step1_20] "=&r"(step1_20), + [step1_27] "=&r"(step1_27), [step1_21] "=&r"(step1_21), + [step1_26] "=&r"(step1_26) + : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20), + [step2_27] "r"(step2_27), [step2_21] "r"(step2_21), + [step2_26] "r"(step2_26), [cospi_16_64] "r"(cospi_16_64)); __asm__ __volatile__( "sub %[temp0], %[step2_25], %[step2_22] \n\t" + "add %[temp1], %[step2_25], %[step2_22] \n\t" + "sub %[temp2], %[step2_24], %[step2_23] \n\t" + "add %[temp3], %[step2_24], %[step2_23] 
\n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" "mthi $zero, $ac0 \n\t" - "madd $ac0, %[temp0], %[cospi_16_64] \n\t" - "extp %[step1_22], $ac0, 31 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" - : [temp0] "=&r"(temp0), [step1_22] "=r"(step1_22) - : [const_2_power_13] "r"(const_2_power_13), [step2_25] "r"(step2_25), - [step2_22] "r"(step2_22), [cospi_16_64] "r"(cospi_16_64)); + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "madd $ac1, %[temp1], %[cospi_16_64] \n\t" + "madd $ac2, %[temp2], %[cospi_16_64] \n\t" + "madd $ac3, %[temp3], %[cospi_16_64] \n\t" - temp21 = (step2_22 + step2_25) * cospi_16_64; - step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + "extp %[step1_22], $ac0, 31 \n\t" + "extp %[step1_25], $ac1, 31 \n\t" + "extp %[step1_23], $ac2, 31 \n\t" + "extp %[step1_24], $ac3, 31 \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), + [temp3] "=&r"(temp3), [step1_22] "=&r"(step1_22), + [step1_25] "=&r"(step1_25), [step1_23] "=&r"(step1_23), + [step1_24] "=&r"(step1_24) + : [const_2_power_13] "r"(const_2_power_13), [step2_22] "r"(step2_22), + [step2_25] "r"(step2_25), [step2_23] "r"(step2_23), + [step2_24] "r"(step2_24), [cospi_16_64] "r"(cospi_16_64)); + // final stage __asm__ __volatile__( - "sub %[temp0], %[step2_24], %[step2_23] \n\t" - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "madd $ac0, %[temp0], %[cospi_16_64] \n\t" - "extp %[step1_23], $ac0, 31 \n\t" + "add %[temp0], %[step1_0], %[step2_31] \n\t" + "add %[temp1], %[step1_1], %[step2_30] \n\t" + "add %[temp2], %[step1_2], %[step2_29] \n\t" + "add %[temp3], %[step1_3], %[step2_28] \n\t" + "sub %[load1], %[step1_3], %[step2_28] \n\t" + "sub %[load2], %[step1_2], %[step2_29] \n\t" + "sub %[load3], %[step1_1], %[step2_30] \n\t" + "sub %[load4], %[step1_0], %[step2_31] \n\t" + "sh %[temp0], 0(%[output]) \n\t" + "sh %[temp1], 64(%[output]) \n\t" + "sh %[temp2], 128(%[output]) \n\t" + "sh %[temp3], 192(%[output]) \n\t" + "sh %[load1], 1792(%[output]) \n\t" + "sh %[load2], 1856(%[output]) \n\t" + "sh %[load3], 1920(%[output]) \n\t" + "sh %[load4], 1984(%[output]) \n\t" + + : [temp0] "=&r"(temp0), [load1] "=&r"(load1), [temp1] "=&r"(temp1), + [load2] "=&r"(load2), [temp2] "=&r"(temp2), [load3] "=&r"(load3), + [temp3] "=&r"(temp3), [load4] "=&r"(load4) + : [step1_0] "r"(step1_0), [step2_31] "r"(step2_31), + [step1_1] "r"(step1_1), [step2_30] "r"(step2_30), + [step1_2] "r"(step1_2), [step2_29] "r"(step2_29), + [step1_3] "r"(step1_3), [step2_28] "r"(step2_28), + [output] "r"(output)); - : [temp0] "=&r"(temp0), [step1_23] "=r"(step1_23) - : [const_2_power_13] "r"(const_2_power_13), [step2_24] "r"(step2_24), - [step2_23] "r"(step2_23), [cospi_16_64] "r"(cospi_16_64)); + __asm__ __volatile__( + "add %[temp0], %[step1_4], %[step1_27] \n\t" + "add %[temp1], %[step1_5], %[step1_26] \n\t" + "add %[temp2], %[step1_6], %[step1_25] \n\t" + "add %[temp3], %[step1_7], %[step1_24] \n\t" + "sub %[load1], %[step1_7], %[step1_24] \n\t" + "sub %[load2], %[step1_6], %[step1_25] \n\t" + "sub %[load3], %[step1_5], %[step1_26] \n\t" + "sub %[load4], %[step1_4], %[step1_27] \n\t" + "sh %[temp0], 256(%[output]) \n\t" + "sh %[temp1], 320(%[output]) \n\t" + "sh %[temp2], 384(%[output]) \n\t" + "sh %[temp3], 448(%[output]) \n\t" + "sh %[load1], 1536(%[output]) \n\t" + "sh %[load2], 1600(%[output]) \n\t" + "sh %[load3], 1664(%[output]) \n\t" + "sh 
%[load4], 1728(%[output]) \n\t" + + : [temp0] "=&r"(temp0), [load1] "=&r"(load1), [temp1] "=&r"(temp1), + [load2] "=&r"(load2), [temp2] "=&r"(temp2), [load3] "=&r"(load3), + [temp3] "=&r"(temp3), [load4] "=&r"(load4) + : [step1_4] "r"(step1_4), [step1_27] "r"(step1_27), + [step1_5] "r"(step1_5), [step1_26] "r"(step1_26), + [step1_6] "r"(step1_6), [step1_25] "r"(step1_25), + [step1_7] "r"(step1_7), [step1_24] "r"(step1_24), + [output] "r"(output)); - temp21 = (step2_23 + step2_24) * cospi_16_64; - step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + __asm__ __volatile__( + "add %[temp0], %[step1_8], %[step1_23] \n\t" + "add %[temp1], %[step1_9], %[step1_22] \n\t" + "add %[temp2], %[step1_10], %[step1_21] \n\t" + "add %[temp3], %[step1_11], %[step1_20] \n\t" + "sub %[load1], %[step1_11], %[step1_20] \n\t" + "sub %[load2], %[step1_10], %[step1_21] \n\t" + "sub %[load3], %[step1_9], %[step1_22] \n\t" + "sub %[load4], %[step1_8], %[step1_23] \n\t" + "sh %[temp0], 512(%[output]) \n\t" + "sh %[temp1], 576(%[output]) \n\t" + "sh %[temp2], 640(%[output]) \n\t" + "sh %[temp3], 704(%[output]) \n\t" + "sh %[load1], 1280(%[output]) \n\t" + "sh %[load2], 1344(%[output]) \n\t" + "sh %[load3], 1408(%[output]) \n\t" + "sh %[load4], 1472(%[output]) \n\t" + + : [temp0] "=&r"(temp0), [load1] "=&r"(load1), [temp1] "=&r"(temp1), + [load2] "=&r"(load2), [temp2] "=&r"(temp2), [load3] "=&r"(load3), + [temp3] "=&r"(temp3), [load4] "=&r"(load4) + : [step1_8] "r"(step1_8), [step1_23] "r"(step1_23), + [step1_9] "r"(step1_9), [step1_22] "r"(step1_22), + [step1_10] "r"(step1_10), [step1_21] "r"(step1_21), + [step1_11] "r"(step1_11), [step1_20] "r"(step1_20), + [output] "r"(output)); - // final stage - output[0 * 32] = step1_0 + step2_31; - output[1 * 32] = step1_1 + step2_30; - output[2 * 32] = step1_2 + step2_29; - output[3 * 32] = step1_3 + step2_28; - output[4 * 32] = step1_4 + step1_27; - output[5 * 32] = step1_5 + step1_26; - output[6 * 32] = step1_6 + step1_25; - output[7 * 32] = step1_7 + step1_24; - output[8 * 32] = step1_8 + step1_23; - output[9 * 32] = step1_9 + step1_22; - output[10 * 32] = step1_10 + step1_21; - output[11 * 32] = step1_11 + step1_20; - output[12 * 32] = step1_12 + step2_19; - output[13 * 32] = step1_13 + step2_18; - output[14 * 32] = step1_14 + step2_17; - output[15 * 32] = step1_15 + step2_16; - output[16 * 32] = step1_15 - step2_16; - output[17 * 32] = step1_14 - step2_17; - output[18 * 32] = step1_13 - step2_18; - output[19 * 32] = step1_12 - step2_19; - output[20 * 32] = step1_11 - step1_20; - output[21 * 32] = step1_10 - step1_21; - output[22 * 32] = step1_9 - step1_22; - output[23 * 32] = step1_8 - step1_23; - output[24 * 32] = step1_7 - step1_24; - output[25 * 32] = step1_6 - step1_25; - output[26 * 32] = step1_5 - step1_26; - output[27 * 32] = step1_4 - step1_27; - output[28 * 32] = step1_3 - step2_28; - output[29 * 32] = step1_2 - step2_29; - output[30 * 32] = step1_1 - step2_30; - output[31 * 32] = step1_0 - step2_31; + __asm__ __volatile__( + "add %[temp0], %[step1_12], %[step2_19] \n\t" + "add %[temp1], %[step1_13], %[step2_18] \n\t" + "add %[temp2], %[step1_14], %[step2_17] \n\t" + "add %[temp3], %[step1_15], %[step2_16] \n\t" + "sub %[load1], %[step1_15], %[step2_16] \n\t" + "sub %[load2], %[step1_14], %[step2_17] \n\t" + "sub %[load3], %[step1_13], %[step2_18] \n\t" + "sub %[load4], %[step1_12], %[step2_19] \n\t" + "sh %[temp0], 768(%[output]) \n\t" + "sh %[temp1], 832(%[output]) \n\t" + "sh %[temp2], 896(%[output]) \n\t" + "sh %[temp3], 960(%[output]) \n\t" + "sh 
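
In the final stage the C stores of the form output[k * 32] = ... are replaced by sh instructions with immediate byte offsets. Since output points to int16_t, row k lives at byte offset k * 32 * sizeof(int16_t) = k * 64, which is where the constants 0, 64, ..., 1984 come from. A small sketch of the equivalence (helper name is illustrative):

static void store_row_model(int16_t *output, int k, int16_t value) {
  /* Old form: plain C indexing, one int16_t every 32 elements. */
  output[k * 32] = value;
  /* New form: the same store expressed with the byte offset used by sh. */
  *(int16_t *)((char *)output + k * 64) = value;
}
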
%[load1], 1024(%[output]) \n\t" + "sh %[load2], 1088(%[output]) \n\t" + "sh %[load3], 1152(%[output]) \n\t" + "sh %[load4], 1216(%[output]) \n\t" + + : [temp0] "=&r"(temp0), [load1] "=&r"(load1), [temp1] "=&r"(temp1), + [load2] "=&r"(load2), [temp2] "=&r"(temp2), [load3] "=&r"(load3), + [temp3] "=&r"(temp3), [load4] "=&r"(load4) + : [step1_12] "r"(step1_12), [step2_19] "r"(step2_19), + [step1_13] "r"(step1_13), [step2_18] "r"(step2_18), + [step1_14] "r"(step1_14), [step2_17] "r"(step2_17), + [step1_15] "r"(step1_15), [step2_16] "r"(step2_16), + [output] "r"(output)); input += 32; output += 1; @@ -836,7 +966,7 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, } void vpx_idct32x32_1024_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride) { + int stride) { DECLARE_ALIGNED(32, int16_t, out[32 * 32]); int16_t *outptr = out; uint32_t pos = 45; @@ -850,7 +980,7 @@ void vpx_idct32x32_1024_add_dspr2(const int16_t *input, uint8_t *dest, idct32_rows_dspr2(input, outptr, 32); // Columns - vpx_idct32_cols_add_blk_dspr2(out, dest, dest_stride); + vpx_idct32_cols_add_blk_dspr2(out, dest, stride); } void vpx_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest, @@ -941,7 +1071,7 @@ void vpx_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest, "abs %[absa1], %[a1] \n\t" "replv.qb %[vector_a1], %[absa1] \n\t" - : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1) + : [absa1] "=&r"(absa1), [vector_a1] "=&r"(vector_a1) : [a1] "r"(a1)); for (r = 32; r--;) { @@ -980,12 +1110,71 @@ void vpx_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest, [dest] "+&r"(dest) : [stride] "r"(stride), [vector_a1] "r"(vector_a1)); } + } else if (a1 > 255) { + int32_t a11, a12, vector_a11, vector_a12; + + /* use quad-byte + * input and output memory are four byte aligned */ + a11 = a1 >> 1; + a12 = a1 - a11; + __asm__ __volatile__( + "replv.qb %[vector_a11], %[a11] \n\t" + "replv.qb %[vector_a12], %[a12] \n\t" + + : [vector_a11] "=&r"(vector_a11), [vector_a12] "=&r"(vector_a12) + : [a11] "r"(a11), [a12] "r"(a12)); + + for (r = 32; r--;) { + __asm__ __volatile__( + "lw %[t1], 0(%[dest]) \n\t" + "lw %[t2], 4(%[dest]) \n\t" + "lw %[t3], 8(%[dest]) \n\t" + "lw %[t4], 12(%[dest]) \n\t" + "addu_s.qb %[vector_1], %[t1], %[vector_a11] \n\t" + "addu_s.qb %[vector_2], %[t2], %[vector_a11] \n\t" + "addu_s.qb %[vector_3], %[t3], %[vector_a11] \n\t" + "addu_s.qb %[vector_4], %[t4], %[vector_a11] \n\t" + "addu_s.qb %[vector_1], %[vector_1], %[vector_a12] \n\t" + "addu_s.qb %[vector_2], %[vector_2], %[vector_a12] \n\t" + "addu_s.qb %[vector_3], %[vector_3], %[vector_a12] \n\t" + "addu_s.qb %[vector_4], %[vector_4], %[vector_a12] \n\t" + "sw %[vector_1], 0(%[dest]) \n\t" + "sw %[vector_2], 4(%[dest]) \n\t" + "sw %[vector_3], 8(%[dest]) \n\t" + "sw %[vector_4], 12(%[dest]) \n\t" + + "lw %[t1], 16(%[dest]) \n\t" + "lw %[t2], 20(%[dest]) \n\t" + "lw %[t3], 24(%[dest]) \n\t" + "lw %[t4], 28(%[dest]) \n\t" + "addu_s.qb %[vector_1], %[t1], %[vector_a11] \n\t" + "addu_s.qb %[vector_2], %[t2], %[vector_a11] \n\t" + "addu_s.qb %[vector_3], %[t3], %[vector_a11] \n\t" + "addu_s.qb %[vector_4], %[t4], %[vector_a11] \n\t" + "addu_s.qb %[vector_1], %[vector_1], %[vector_a12] \n\t" + "addu_s.qb %[vector_2], %[vector_2], %[vector_a12] \n\t" + "addu_s.qb %[vector_3], %[vector_3], %[vector_a12] \n\t" + "addu_s.qb %[vector_4], %[vector_4], %[vector_a12] \n\t" + "sw %[vector_1], 16(%[dest]) \n\t" + "sw %[vector_2], 20(%[dest]) \n\t" + "sw %[vector_3], 24(%[dest]) \n\t" + "sw %[vector_4], 28(%[dest]) \n\t" + + "add 
%[dest], %[dest], %[stride] \n\t" + + : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4), + [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2), + [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4), + [dest] "+&r"(dest) + : [stride] "r"(stride), [vector_a11] "r"(vector_a11), + [vector_a12] "r"(vector_a12)); + } } else { /* use quad-byte * input and output memory are four byte aligned */ __asm__ __volatile__("replv.qb %[vector_a1], %[a1] \n\t" - : [vector_a1] "=r"(vector_a1) + : [vector_a1] "=&r"(vector_a1) : [a1] "r"(a1)); for (r = 32; r--;) { diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/itrans4_dspr2.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/itrans4_dspr2.c index 516ea80f4ae..3f985b847b1 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/itrans4_dspr2.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/itrans4_dspr2.c @@ -15,7 +15,7 @@ #if HAVE_DSPR2 void vpx_idct4_rows_dspr2(const int16_t *input, int16_t *output) { - int16_t step_0, step_1, step_2, step_3; + int step_0, step_1, step_2, step_3; int Temp0, Temp1, Temp2, Temp3; const int const_2_power_13 = 8192; int i; @@ -96,23 +96,13 @@ void vpx_idct4_rows_dspr2(const int16_t *input, int16_t *output) { } void vpx_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, - int dest_stride) { - int16_t step_0, step_1, step_2, step_3; + int stride) { + int step_0, step_1, step_2, step_3; int Temp0, Temp1, Temp2, Temp3; const int const_2_power_13 = 8192; + const int const_255 = 255; int i; uint8_t *dest_pix; - uint8_t *cm = vpx_ff_cropTbl; - - /* prefetch vpx_ff_cropTbl */ - prefetch_load(vpx_ff_cropTbl); - prefetch_load(vpx_ff_cropTbl + 32); - prefetch_load(vpx_ff_cropTbl + 64); - prefetch_load(vpx_ff_cropTbl + 96); - prefetch_load(vpx_ff_cropTbl + 128); - prefetch_load(vpx_ff_cropTbl + 160); - prefetch_load(vpx_ff_cropTbl + 192); - prefetch_load(vpx_ff_cropTbl + 224); for (i = 0; i < 4; ++i) { dest_pix = (dest + i); @@ -172,51 +162,62 @@ void vpx_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, "sra %[Temp0], %[Temp0], 4 \n\t" "lbu %[Temp1], 0(%[dest_pix]) \n\t" "add %[Temp1], %[Temp1], %[Temp0] \n\t" - "add %[Temp0], %[step_1], %[step_2] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "slt %[Temp2], %[Temp1], %[const_255] \n\t" + "slt %[Temp3], $zero, %[Temp1] \n\t" + "movz %[Temp1], %[const_255], %[Temp2] \n\t" + "movz %[Temp1], $zero, %[Temp3] \n\t" + "sb %[Temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" + "add %[Temp0], %[step_1], %[step_2] \n\t" "addi %[Temp0], %[Temp0], 8 \n\t" "sra %[Temp0], %[Temp0], 4 \n\t" "lbu %[Temp1], 0(%[dest_pix]) \n\t" "add %[Temp1], %[Temp1], %[Temp0] \n\t" - "sub %[Temp0], %[step_1], %[step_2] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "slt %[Temp2], %[Temp1], %[const_255] \n\t" + "slt %[Temp3], $zero, %[Temp1] \n\t" + "movz %[Temp1], %[const_255], %[Temp2] \n\t" + "movz %[Temp1], $zero, %[Temp3] \n\t" + "sb %[Temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" + "sub %[Temp0], %[step_1], %[step_2] \n\t" "addi %[Temp0], %[Temp0], 8 \n\t" "sra %[Temp0], %[Temp0], 4 \n\t" "lbu %[Temp1], 0(%[dest_pix]) \n\t" "add %[Temp1], %[Temp1], %[Temp0] \n\t" - "sub %[Temp0], %[step_0], %[step_3] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - 
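
The new a1 > 255 branch in vpx_idct32x32_1_add_dspr2 exists because replv.qb can only broadcast an 8-bit value into each byte lane. The DC offset is therefore split into two halves (a11 = a1 >> 1, a12 = a1 - a11) and applied with two saturating byte additions (addu_s.qb). A per-byte scalar model of that path, with an illustrative helper name:

static unsigned char add_dc_split_model(unsigned char pixel, int a1) {
  const int a11 = a1 >> 1;   /* first value broadcast with replv.qb */
  const int a12 = a1 - a11;  /* second value broadcast with replv.qb */
  int t = pixel + a11;
  if (t > 255) t = 255;      /* addu_s.qb saturates each byte lane at 255 */
  t += a12;
  if (t > 255) t = 255;
  return (unsigned char)t;
}
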
"addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "slt %[Temp2], %[Temp1], %[const_255] \n\t" + "slt %[Temp3], $zero, %[Temp1] \n\t" + "movz %[Temp1], %[const_255], %[Temp2] \n\t" + "movz %[Temp1], $zero, %[Temp3] \n\t" + "sb %[Temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" + "sub %[Temp0], %[step_0], %[step_3] \n\t" "addi %[Temp0], %[Temp0], 8 \n\t" "sra %[Temp0], %[Temp0], 4 \n\t" "lbu %[Temp1], 0(%[dest_pix]) \n\t" "add %[Temp1], %[Temp1], %[Temp0] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" + "slt %[Temp2], %[Temp1], %[const_255] \n\t" + "slt %[Temp3], $zero, %[Temp1] \n\t" + "movz %[Temp1], %[const_255], %[Temp2] \n\t" + "movz %[Temp1], $zero, %[Temp3] \n\t" + "sb %[Temp1], 0(%[dest_pix]) \n\t" : [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), [step_0] "=&r"(step_0), [step_1] "=&r"(step_1), [step_2] "=&r"(step_2), [step_3] "=&r"(step_3), [dest_pix] "+r"(dest_pix) - : [const_2_power_13] "r"(const_2_power_13), + : [const_2_power_13] "r"(const_2_power_13), [const_255] "r"(const_255), [cospi_8_64] "r"(cospi_8_64), [cospi_16_64] "r"(cospi_16_64), - [cospi_24_64] "r"(cospi_24_64), [input] "r"(input), [cm] "r"(cm), - [dest_stride] "r"(dest_stride)); + [cospi_24_64] "r"(cospi_24_64), [input] "r"(input), + [stride] "r"(stride)); input += 4; } } -void vpx_idct4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride) { +void vpx_idct4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, int stride) { DECLARE_ALIGNED(32, int16_t, out[4 * 4]); int16_t *outptr = out; uint32_t pos = 45; @@ -230,11 +231,10 @@ void vpx_idct4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, vpx_idct4_rows_dspr2(input, outptr); // Columns - vpx_idct4_columns_add_blk_dspr2(&out[0], dest, dest_stride); + vpx_idct4_columns_add_blk_dspr2(&out[0], dest, stride); } -void vpx_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride) { +void vpx_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest, int stride) { int a1, absa1; int r; int32_t out; @@ -271,10 +271,43 @@ void vpx_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest, "lw %[t2], 0(%[dest]) \n\t" "subu_s.qb %[vector_a], %[t2], %[vector_a1] \n\t" "sw %[vector_a], 0(%[dest]) \n\t" - "add %[dest], %[dest], %[dest_stride] \n\t" + "add %[dest], %[dest], %[stride] \n\t" + + : [t2] "=&r"(t2), [vector_a] "=&r"(vector_a), [dest] "+&r"(dest) + : [stride] "r"(stride), [vector_a1] "r"(vector_a1)); + } + } else if (a1 > 255) { + int32_t a11, a12, vector_a11, vector_a12; + + /* use quad-byte + * input and output memory are four byte aligned */ + a11 = a1 >> 3; + a12 = a1 - (a11 * 7); + + __asm__ __volatile__( + "replv.qb %[vector_a11], %[a11] \n\t" + "replv.qb %[vector_a12], %[a12] \n\t" + + : [vector_a11] "=&r"(vector_a11), [vector_a12] "=&r"(vector_a12) + : [a11] "r"(a11), [a12] "r"(a12)); + + for (r = 4; r--;) { + __asm__ __volatile__( + "lw %[t2], 4(%[dest]) \n\t" + "addu_s.qb %[vector_a], %[t2], %[vector_a11] \n\t" + "addu_s.qb %[vector_a], %[vector_a], %[vector_a11] \n\t" + "addu_s.qb %[vector_a], %[vector_a], %[vector_a11] \n\t" + "addu_s.qb %[vector_a], %[vector_a], %[vector_a11] \n\t" + "addu_s.qb %[vector_a], %[vector_a], %[vector_a11] \n\t" + "addu_s.qb %[vector_a], %[vector_a], %[vector_a11] \n\t" + "addu_s.qb %[vector_a], %[vector_a], %[vector_a11] \n\t" + "addu_s.qb %[vector_a], %[vector_a], %[vector_a12] \n\t" + "sw %[vector_a], 0(%[dest]) \n\t" + "add %[dest], %[dest], %[stride] \n\t" : [t2] "=&r"(t2), 
[vector_a] "=&r"(vector_a), [dest] "+&r"(dest) - : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1)); + : [stride] "r"(stride), [vector_a11] "r"(vector_a11), + [vector_a12] "r"(vector_a12)); } } else { /* use quad-byte @@ -288,10 +321,10 @@ void vpx_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest, "lw %[t2], 0(%[dest]) \n\t" "addu_s.qb %[vector_a], %[t2], %[vector_a1] \n\t" "sw %[vector_a], 0(%[dest]) \n\t" - "add %[dest], %[dest], %[dest_stride] \n\t" + "add %[dest], %[dest], %[stride] \n\t" : [t2] "=&r"(t2), [vector_a] "=&r"(vector_a), [dest] "+&r"(dest) - : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1)); + : [stride] "r"(stride), [vector_a1] "r"(vector_a1)); } } } diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/itrans8_dspr2.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/itrans8_dspr2.c index 08a6c78b6e4..d4d246965c3 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/itrans8_dspr2.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/itrans8_dspr2.c @@ -192,24 +192,13 @@ void idct8_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows) { } } -void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, - int dest_stride) { +void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, int stride) { int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; int Temp0, Temp1, Temp2, Temp3; int i; const int const_2_power_13 = 8192; + const int const_255 = 255; uint8_t *dest_pix; - uint8_t *cm = vpx_ff_cropTbl; - - /* prefetch vpx_ff_cropTbl */ - prefetch_load(vpx_ff_cropTbl); - prefetch_load(vpx_ff_cropTbl + 32); - prefetch_load(vpx_ff_cropTbl + 64); - prefetch_load(vpx_ff_cropTbl + 96); - prefetch_load(vpx_ff_cropTbl + 128); - prefetch_load(vpx_ff_cropTbl + 160); - prefetch_load(vpx_ff_cropTbl + 192); - prefetch_load(vpx_ff_cropTbl + 224); for (i = 0; i < 8; ++i) { dest_pix = (dest + i); @@ -356,70 +345,94 @@ void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, "sra %[Temp0], %[Temp0], 5 \n\t" "add %[Temp1], %[Temp1], %[Temp0] \n\t" "add %[Temp0], %[step1_1], %[step1_6] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "slt %[Temp2], %[Temp1], %[const_255] \n\t" + "slt %[Temp3], $zero, %[Temp1] \n\t" + "movz %[Temp1], %[const_255], %[Temp2] \n\t" + "movz %[Temp1], $zero, %[Temp3] \n\t" + "sb %[Temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[Temp1], 0(%[dest_pix]) \n\t" "addi %[Temp0], %[Temp0], 16 \n\t" "sra %[Temp0], %[Temp0], 5 \n\t" "add %[Temp1], %[Temp1], %[Temp0] \n\t" "add %[Temp0], %[step1_2], %[step1_5] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "slt %[Temp2], %[Temp1], %[const_255] \n\t" + "slt %[Temp3], $zero, %[Temp1] \n\t" + "movz %[Temp1], %[const_255], %[Temp2] \n\t" + "movz %[Temp1], $zero, %[Temp3] \n\t" + "sb %[Temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[Temp1], 0(%[dest_pix]) \n\t" "addi %[Temp0], %[Temp0], 16 \n\t" "sra %[Temp0], %[Temp0], 5 \n\t" "add %[Temp1], %[Temp1], %[Temp0] \n\t" "add %[Temp0], %[step1_3], %[step1_4] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "slt %[Temp2], %[Temp1], %[const_255] \n\t" + "slt %[Temp3], $zero, %[Temp1] \n\t" + "movz %[Temp1], %[const_255], 
%[Temp2] \n\t" + "movz %[Temp1], $zero, %[Temp3] \n\t" + "sb %[Temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[Temp1], 0(%[dest_pix]) \n\t" "addi %[Temp0], %[Temp0], 16 \n\t" "sra %[Temp0], %[Temp0], 5 \n\t" "add %[Temp1], %[Temp1], %[Temp0] \n\t" "sub %[Temp0], %[step1_3], %[step1_4] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "slt %[Temp2], %[Temp1], %[const_255] \n\t" + "slt %[Temp3], $zero, %[Temp1] \n\t" + "movz %[Temp1], %[const_255], %[Temp2] \n\t" + "movz %[Temp1], $zero, %[Temp3] \n\t" + "sb %[Temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[Temp1], 0(%[dest_pix]) \n\t" "addi %[Temp0], %[Temp0], 16 \n\t" "sra %[Temp0], %[Temp0], 5 \n\t" "add %[Temp1], %[Temp1], %[Temp0] \n\t" "sub %[Temp0], %[step1_2], %[step1_5] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "slt %[Temp2], %[Temp1], %[const_255] \n\t" + "slt %[Temp3], $zero, %[Temp1] \n\t" + "movz %[Temp1], %[const_255], %[Temp2] \n\t" + "movz %[Temp1], $zero, %[Temp3] \n\t" + "sb %[Temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[Temp1], 0(%[dest_pix]) \n\t" "addi %[Temp0], %[Temp0], 16 \n\t" "sra %[Temp0], %[Temp0], 5 \n\t" "add %[Temp1], %[Temp1], %[Temp0] \n\t" "sub %[Temp0], %[step1_1], %[step1_6] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "slt %[Temp2], %[Temp1], %[const_255] \n\t" + "slt %[Temp3], $zero, %[Temp1] \n\t" + "movz %[Temp1], %[const_255], %[Temp2] \n\t" + "movz %[Temp1], $zero, %[Temp3] \n\t" + "sb %[Temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[Temp1], 0(%[dest_pix]) \n\t" "addi %[Temp0], %[Temp0], 16 \n\t" "sra %[Temp0], %[Temp0], 5 \n\t" "add %[Temp1], %[Temp1], %[Temp0] \n\t" "sub %[Temp0], %[step1_0], %[step1_7] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "slt %[Temp2], %[Temp1], %[const_255] \n\t" + "slt %[Temp3], $zero, %[Temp1] \n\t" + "movz %[Temp1], %[const_255], %[Temp2] \n\t" + "movz %[Temp1], $zero, %[Temp3] \n\t" + "sb %[Temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[Temp1], 0(%[dest_pix]) \n\t" "addi %[Temp0], %[Temp0], 16 \n\t" "sra %[Temp0], %[Temp0], 5 \n\t" "add %[Temp1], %[Temp1], %[Temp0] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" + "slt %[Temp2], %[Temp1], %[const_255] \n\t" + "slt %[Temp3], $zero, %[Temp1] \n\t" + "movz %[Temp1], %[const_255], %[Temp2] \n\t" + "movz %[Temp1], $zero, %[Temp3] \n\t" + "sb %[Temp1], 0(%[dest_pix]) \n\t" : [step1_0] "=&r"(step1_0), [step1_1] "=&r"(step1_1), [step1_2] "=&r"(step1_2), [step1_3] "=&r"(step1_3), @@ -427,19 +440,18 @@ void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, [step1_6] "=&r"(step1_6), [step1_7] "=&r"(step1_7), [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), [dest_pix] "+r"(dest_pix) - : [const_2_power_13] "r"(const_2_power_13), + : [const_2_power_13] "r"(const_2_power_13), [const_255] "r"(const_255), [cospi_16_64] "r"(cospi_16_64), [cospi_28_64] "r"(cospi_28_64), [cospi_4_64] "r"(cospi_4_64), [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64), [cospi_8_64] "r"(cospi_8_64), 
- [cospi_24_64] "r"(cospi_24_64), [input] "r"(input), [cm] "r"(cm), - [dest_stride] "r"(dest_stride)); + [cospi_24_64] "r"(cospi_24_64), [input] "r"(input), + [stride] "r"(stride)); input += 8; } } -void vpx_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride) { +void vpx_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, int stride) { DECLARE_ALIGNED(32, int16_t, out[8 * 8]); int16_t *outptr = out; uint32_t pos = 45; @@ -451,11 +463,10 @@ void vpx_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, idct8_rows_dspr2(input, outptr, 8); // Then transform columns and add to dest - idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride); + idct8_columns_add_blk_dspr2(&out[0], dest, stride); } -void vpx_idct8x8_12_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride) { +void vpx_idct8x8_12_add_dspr2(const int16_t *input, uint8_t *dest, int stride) { DECLARE_ALIGNED(32, int16_t, out[8 * 8]); int16_t *outptr = out; uint32_t pos = 45; @@ -490,11 +501,10 @@ void vpx_idct8x8_12_add_dspr2(const int16_t *input, uint8_t *dest, : [outptr] "r"(outptr)); // Then transform columns and add to dest - idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride); + idct8_columns_add_blk_dspr2(&out[0], dest, stride); } -void vpx_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride) { +void vpx_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest, int stride) { uint32_t pos = 45; int32_t out; int32_t r; @@ -533,11 +543,47 @@ void vpx_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest, "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" "sw %[vector_1], 0(%[dest]) \n\t" "sw %[vector_2], 4(%[dest]) \n\t" - "add %[dest], %[dest], %[dest_stride] \n\t" + "add %[dest], %[dest], %[stride] \n\t" : [t1] "=&r"(t1), [t2] "=&r"(t2), [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2), [dest] "+&r"(dest) - : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1)); + : [stride] "r"(stride), [vector_a1] "r"(vector_a1)); + } + } else if (a1 > 255) { + int32_t a11, a12, vector_a11, vector_a12; + + /* use quad-byte + * input and output memory are four byte aligned */ + a11 = a1 >> 2; + a12 = a1 - (a11 * 3); + + __asm__ __volatile__( + "replv.qb %[vector_a11], %[a11] \n\t" + "replv.qb %[vector_a12], %[a12] \n\t" + + : [vector_a11] "=&r"(vector_a11), [vector_a12] "=&r"(vector_a12) + : [a11] "r"(a11), [a12] "r"(a12)); + + for (r = 8; r--;) { + __asm__ __volatile__( + "lw %[t1], 0(%[dest]) \n\t" + "lw %[t2], 4(%[dest]) \n\t" + "addu_s.qb %[vector_1], %[t1], %[vector_a11] \n\t" + "addu_s.qb %[vector_2], %[t2], %[vector_a11] \n\t" + "addu_s.qb %[vector_1], %[vector_1], %[vector_a11] \n\t" + "addu_s.qb %[vector_2], %[vector_2], %[vector_a11] \n\t" + "addu_s.qb %[vector_1], %[vector_1], %[vector_a11] \n\t" + "addu_s.qb %[vector_2], %[vector_2], %[vector_a11] \n\t" + "addu_s.qb %[vector_1], %[vector_1], %[vector_a12] \n\t" + "addu_s.qb %[vector_2], %[vector_2], %[vector_a12] \n\t" + "sw %[vector_1], 0(%[dest]) \n\t" + "sw %[vector_2], 4(%[dest]) \n\t" + "add %[dest], %[dest], %[stride] \n\t" + + : [t1] "=&r"(t1), [t2] "=&r"(t2), [vector_1] "=&r"(vector_1), + [vector_2] "=&r"(vector_2), [dest] "+r"(dest) + : [stride] "r"(stride), [vector_a11] "r"(vector_a11), + [vector_a12] "r"(vector_a12)); } } else { /* use quad-byte @@ -555,11 +601,11 @@ void vpx_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest, "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" "sw %[vector_1], 0(%[dest]) \n\t" "sw %[vector_2], 4(%[dest]) \n\t" - "add %[dest], %[dest], %[dest_stride] 
\n\t" + "add %[dest], %[dest], %[stride] \n\t" : [t1] "=&r"(t1), [t2] "=&r"(t2), [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2), [dest] "+r"(dest) - : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1)); + : [stride] "r"(stride), [vector_a1] "r"(vector_a1)); } } } diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/txfm_macros_msa.h b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/txfm_macros_msa.h index da100f6a980..f077fa4814a 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/txfm_macros_msa.h +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/txfm_macros_msa.h @@ -15,19 +15,24 @@ #define DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) \ { \ - v8i16 k0_m = __msa_fill_h(cnst0); \ - v4i32 s0_m, s1_m, s2_m, s3_m; \ + v4i32 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m; \ + v8i16 k0_m, k1_m, k2_m, zero = { 0 }; \ \ - s0_m = (v4i32)__msa_fill_h(cnst1); \ - k0_m = __msa_ilvev_h((v8i16)s0_m, k0_m); \ + k0_m = __msa_fill_h(cnst0); \ + k1_m = __msa_fill_h(cnst1); \ + k2_m = __msa_ilvev_h((v8i16)k1_m, k0_m); \ + k0_m = __msa_ilvev_h((v8i16)zero, k0_m); \ + k1_m = __msa_ilvev_h(k1_m, (v8i16)zero); \ \ - ILVRL_H2_SW((-reg1), reg0, s1_m, s0_m); \ + ILVRL_H2_SW(reg1, reg0, s5_m, s4_m); \ ILVRL_H2_SW(reg0, reg1, s3_m, s2_m); \ - DOTP_SH2_SW(s1_m, s0_m, k0_m, k0_m, s1_m, s0_m); \ + DOTP_SH2_SW(s5_m, s4_m, k0_m, k0_m, s1_m, s0_m); \ + s1_m = __msa_dpsub_s_w(s1_m, (v8i16)s5_m, k1_m); \ + s0_m = __msa_dpsub_s_w(s0_m, (v8i16)s4_m, k1_m); \ SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS); \ out0 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m); \ \ - DOTP_SH2_SW(s3_m, s2_m, k0_m, k0_m, s1_m, s0_m); \ + DOTP_SH2_SW(s3_m, s2_m, k2_m, k2_m, s1_m, s0_m); \ SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS); \ out1 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m); \ } diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp.mk b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp.mk index 2909beb0f6c..bb20ea27421 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp.mk +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp.mk @@ -48,6 +48,7 @@ DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_8t_ssse3.asm ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE) += x86/highbd_intrapred_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/highbd_intrapred_sse2.asm +DSP_SRCS-$(HAVE_NEON) += arm/highbd_intrapred_neon.c endif # CONFIG_VP9_HIGHBITDEPTH ifneq ($(filter yes,$(CONFIG_POSTPROC) $(CONFIG_VP9_POSTPROC)),) @@ -56,6 +57,7 @@ DSP_SRCS-yes += deblock.c DSP_SRCS-yes += postproc.h DSP_SRCS-$(HAVE_MSA) += mips/add_noise_msa.c DSP_SRCS-$(HAVE_MSA) += mips/deblock_msa.c +DSP_SRCS-$(HAVE_NEON) += arm/deblock_neon.c DSP_SRCS-$(HAVE_SSE2) += x86/add_noise_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/deblock_sse2.asm endif # CONFIG_POSTPROC @@ -140,14 +142,11 @@ DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/loopfilter_sse2.c DSP_SRCS-$(HAVE_AVX2) += x86/loopfilter_avx2.c ifeq ($(HAVE_NEON_ASM),yes) -DSP_SRCS-yes += arm/loopfilter_vertical_4_dual_neon.c DSP_SRCS-yes += arm/loopfilter_16_neon$(ASM) DSP_SRCS-yes += arm/loopfilter_8_neon$(ASM) DSP_SRCS-yes += arm/loopfilter_4_neon$(ASM) else -ifeq ($(HAVE_NEON),yes) -DSP_SRCS-yes += arm/loopfilter_neon.c -endif # HAVE_NEON +DSP_SRCS-$(HAVE_NEON) += arm/loopfilter_neon.c endif # HAVE_NEON_ASM DSP_SRCS-$(HAVE_MSA) += mips/loopfilter_msa.h @@ -203,17 +202,6 @@ endif # ARCH_X86_64 DSP_SRCS-$(HAVE_NEON_ASM) += arm/save_reg_neon$(ASM) ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes) -ifeq ($(HAVE_NEON_ASM),yes) -DSP_SRCS-yes += 
arm/idct16x16_add_neon$(ASM) -else -ifeq ($(HAVE_NEON),yes) -DSP_SRCS-yes += arm/idct16x16_add_neon.c -endif # HAVE_NEON -endif # HAVE_NEON_ASM -DSP_SRCS-$(HAVE_NEON) += arm/idct16x16_neon.c -DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_add_neon.c -DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_34_add_neon.c - DSP_SRCS-$(HAVE_MSA) += mips/inv_txfm_msa.h DSP_SRCS-$(HAVE_MSA) += mips/idct4x4_msa.c DSP_SRCS-$(HAVE_MSA) += mips/idct8x8_msa.c @@ -226,6 +214,9 @@ DSP_SRCS-$(HAVE_DSPR2) += mips/itrans8_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/itrans16_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_cols_dspr2.c +else # CONFIG_VP9_HIGHBITDEPTH +DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct4x4_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct8x8_add_neon.c endif # !CONFIG_VP9_HIGHBITDEPTH ifeq ($(HAVE_NEON_ASM),yes) @@ -235,15 +226,21 @@ DSP_SRCS-yes += arm/idct4x4_add_neon$(ASM) DSP_SRCS-yes += arm/idct8x8_1_add_neon$(ASM) DSP_SRCS-yes += arm/idct8x8_add_neon$(ASM) DSP_SRCS-yes += arm/idct16x16_1_add_neon$(ASM) +DSP_SRCS-yes += arm/idct16x16_add_neon$(ASM) +DSP_SRCS-yes += arm/idct16x16_neon.c else DSP_SRCS-$(HAVE_NEON) += arm/idct4x4_1_add_neon.c DSP_SRCS-$(HAVE_NEON) += arm/idct4x4_add_neon.c DSP_SRCS-$(HAVE_NEON) += arm/idct8x8_1_add_neon.c DSP_SRCS-$(HAVE_NEON) += arm/idct8x8_add_neon.c DSP_SRCS-$(HAVE_NEON) += arm/idct16x16_1_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/idct16x16_add_neon.c endif # HAVE_NEON_ASM DSP_SRCS-$(HAVE_NEON) += arm/idct_neon.h DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_1_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_34_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_135_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_add_neon.c endif # CONFIG_VP9 diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl index ee403be3975..ee1b2927938 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -96,6 +96,7 @@ specialize qw/vpx_h_predictor_8x8 neon dspr2 msa sse2/; add_proto qw/void vpx_d117_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; add_proto qw/void vpx_d135_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vpx_d135_predictor_8x8 neon/; add_proto qw/void vpx_d153_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d153_predictor_8x8 ssse3/; @@ -139,6 +140,7 @@ specialize qw/vpx_h_predictor_16x16 neon dspr2 msa sse2/; add_proto qw/void vpx_d117_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; add_proto qw/void vpx_d135_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vpx_d135_predictor_16x16 neon/; add_proto qw/void vpx_d153_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d153_predictor_16x16 ssse3/; @@ -167,7 +169,7 @@ specialize qw/vpx_d207_predictor_32x32 ssse3/; add_proto qw/void vpx_d207e_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; add_proto qw/void vpx_d45_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_d45_predictor_32x32 ssse3/; +specialize qw/vpx_d45_predictor_32x32 
neon ssse3/; add_proto qw/void vpx_d45e_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; @@ -182,6 +184,7 @@ specialize qw/vpx_h_predictor_32x32 neon msa sse2/; add_proto qw/void vpx_d117_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; add_proto qw/void vpx_d135_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vpx_d135_predictor_32x32 neon/; add_proto qw/void vpx_d153_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d153_predictor_32x32 ssse3/; @@ -211,6 +214,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_d207e_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; add_proto qw/void vpx_highbd_d45_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_d45_predictor_4x4 neon/; add_proto qw/void vpx_highbd_d45e_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; @@ -219,33 +223,39 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_d63e_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; add_proto qw/void vpx_highbd_h_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_h_predictor_4x4 neon/; add_proto qw/void vpx_highbd_d117_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; add_proto qw/void vpx_highbd_d135_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_d135_predictor_4x4 neon/; add_proto qw/void vpx_highbd_d153_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; add_proto qw/void vpx_highbd_v_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_v_predictor_4x4 sse2/; + specialize qw/vpx_highbd_v_predictor_4x4 neon sse2/; add_proto qw/void vpx_highbd_tm_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_tm_predictor_4x4 sse2/; + specialize qw/vpx_highbd_tm_predictor_4x4 neon sse2/; add_proto qw/void vpx_highbd_dc_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_dc_predictor_4x4 sse2/; + specialize qw/vpx_highbd_dc_predictor_4x4 neon sse2/; add_proto qw/void vpx_highbd_dc_top_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_dc_top_predictor_4x4 neon/; add_proto qw/void vpx_highbd_dc_left_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_dc_left_predictor_4x4 neon/; add_proto qw/void vpx_highbd_dc_128_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_dc_128_predictor_4x4 neon/; add_proto qw/void vpx_highbd_d207_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, 
int bd"; add_proto qw/void vpx_highbd_d207e_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; add_proto qw/void vpx_highbd_d45_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_d45_predictor_8x8 neon/; add_proto qw/void vpx_highbd_d45e_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; @@ -254,33 +264,39 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_d63e_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; add_proto qw/void vpx_highbd_h_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_h_predictor_8x8 neon/; add_proto qw/void vpx_highbd_d117_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; add_proto qw/void vpx_highbd_d135_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_d135_predictor_8x8 neon/; add_proto qw/void vpx_highbd_d153_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; add_proto qw/void vpx_highbd_v_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_v_predictor_8x8 sse2/; + specialize qw/vpx_highbd_v_predictor_8x8 neon sse2/; add_proto qw/void vpx_highbd_tm_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_tm_predictor_8x8 sse2/; + specialize qw/vpx_highbd_tm_predictor_8x8 neon sse2/; add_proto qw/void vpx_highbd_dc_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_dc_predictor_8x8 sse2/;; + specialize qw/vpx_highbd_dc_predictor_8x8 neon sse2/; add_proto qw/void vpx_highbd_dc_top_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_dc_top_predictor_8x8 neon/; add_proto qw/void vpx_highbd_dc_left_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_dc_left_predictor_8x8 neon/; add_proto qw/void vpx_highbd_dc_128_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_dc_128_predictor_8x8 neon/; add_proto qw/void vpx_highbd_d207_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; add_proto qw/void vpx_highbd_d207e_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; add_proto qw/void vpx_highbd_d45_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_d45_predictor_16x16 neon/; add_proto qw/void vpx_highbd_d45e_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; @@ -289,33 +305,39 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_d63e_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; add_proto qw/void 
vpx_highbd_h_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_h_predictor_16x16 neon/; add_proto qw/void vpx_highbd_d117_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; add_proto qw/void vpx_highbd_d135_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_d135_predictor_16x16 neon/; add_proto qw/void vpx_highbd_d153_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; add_proto qw/void vpx_highbd_v_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_v_predictor_16x16 sse2/; + specialize qw/vpx_highbd_v_predictor_16x16 neon sse2/; add_proto qw/void vpx_highbd_tm_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_tm_predictor_16x16 sse2/; + specialize qw/vpx_highbd_tm_predictor_16x16 neon sse2/; add_proto qw/void vpx_highbd_dc_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_dc_predictor_16x16 sse2/; + specialize qw/vpx_highbd_dc_predictor_16x16 neon sse2/; add_proto qw/void vpx_highbd_dc_top_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_dc_top_predictor_16x16 neon/; add_proto qw/void vpx_highbd_dc_left_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_dc_left_predictor_16x16 neon/; add_proto qw/void vpx_highbd_dc_128_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_dc_128_predictor_16x16 neon/; add_proto qw/void vpx_highbd_d207_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; add_proto qw/void vpx_highbd_d207e_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; add_proto qw/void vpx_highbd_d45_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_d45_predictor_32x32 neon/; add_proto qw/void vpx_highbd_d45e_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; @@ -324,27 +346,32 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_d63e_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; add_proto qw/void vpx_highbd_h_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_h_predictor_32x32 neon/; add_proto qw/void vpx_highbd_d117_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; add_proto qw/void vpx_highbd_d135_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_d135_predictor_32x32 neon/; add_proto qw/void vpx_highbd_d153_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; add_proto qw/void 
vpx_highbd_v_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_v_predictor_32x32 sse2/; + specialize qw/vpx_highbd_v_predictor_32x32 neon sse2/; add_proto qw/void vpx_highbd_tm_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_tm_predictor_32x32 sse2/; + specialize qw/vpx_highbd_tm_predictor_32x32 neon sse2/; add_proto qw/void vpx_highbd_dc_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_dc_predictor_32x32 sse2/; + specialize qw/vpx_highbd_dc_predictor_32x32 neon sse2/; add_proto qw/void vpx_highbd_dc_top_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_dc_top_predictor_32x32 neon/; add_proto qw/void vpx_highbd_dc_left_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_dc_left_predictor_32x32 neon/; add_proto qw/void vpx_highbd_dc_128_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_dc_128_predictor_32x32 neon/; } # CONFIG_VP9_HIGHBITDEPTH # @@ -585,193 +612,193 @@ if (vpx_config("CONFIG_VP9") eq "yes") { if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # Note as optimized versions of these functions are added we need to add a check to ensure # that when CONFIG_EMULATE_HARDWARE is on, it defaults to the C versions only. - add_proto qw/void vpx_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - add_proto qw/void vpx_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride"; specialize qw/vpx_iwht4x4_16_add sse2/; - add_proto qw/void vpx_highbd_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; + add_proto qw/void vpx_highbd_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; + specialize qw/vpx_highbd_idct4x4_1_add neon/; - add_proto qw/void vpx_highbd_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; + add_proto qw/void vpx_highbd_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; + specialize qw/vpx_highbd_idct8x8_1_add neon/; - add_proto qw/void vpx_highbd_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; + add_proto qw/void vpx_highbd_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; - add_proto qw/void vpx_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; + add_proto qw/void vpx_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; - add_proto qw/void vpx_highbd_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; + add_proto qw/void vpx_highbd_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; - add_proto qw/void vpx_highbd_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; + add_proto qw/void vpx_highbd_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; specialize 
qw/vpx_highbd_idct32x32_1_add sse2/; - add_proto qw/void vpx_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; + add_proto qw/void vpx_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; - add_proto qw/void vpx_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; + add_proto qw/void vpx_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; # Force C versions if CONFIG_EMULATE_HARDWARE is 1 if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") { - add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - add_proto qw/void vpx_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; + add_proto qw/void vpx_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; - add_proto qw/void vpx_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; + add_proto qw/void vpx_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; - add_proto qw/void vpx_highbd_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; + add_proto qw/void 
vpx_highbd_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; - add_proto qw/void vpx_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; + add_proto qw/void vpx_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; - add_proto qw/void vpx_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; + add_proto qw/void vpx_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; } else { - add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride"; specialize qw/vpx_idct4x4_16_add neon sse2/; - add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride"; specialize qw/vpx_idct4x4_1_add neon sse2/; - add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride"; specialize qw/vpx_idct8x8_64_add neon sse2/, "$ssse3_x86_64"; - add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int stride"; specialize qw/vpx_idct8x8_12_add neon sse2/, "$ssse3_x86_64"; - add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int stride"; specialize qw/vpx_idct8x8_1_add neon sse2/; - add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vpx_idct16x16_256_add sse2/; + add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int stride"; + specialize qw/vpx_idct16x16_256_add neon sse2/; - add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vpx_idct16x16_10_add sse2/; + add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int stride"; + specialize qw/vpx_idct16x16_10_add neon sse2/; - add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int stride"; specialize qw/vpx_idct16x16_1_add neon sse2/; - add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vpx_idct32x32_1024_add sse2/, "$ssse3_x86_64"; + add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int stride"; + specialize qw/vpx_idct32x32_1024_add neon sse2/, "$ssse3_x86_64"; - add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vpx_idct32x32_135_add sse2/, "$ssse3_x86_64"; + add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int stride"; + specialize qw/vpx_idct32x32_135_add neon sse2/, "$ssse3_x86_64"; # Need to add 135 eob idct32x32 implementations. 
$vpx_idct32x32_135_add_sse2=vpx_idct32x32_1024_add_sse2; - add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vpx_idct32x32_34_add sse2/, "$ssse3_x86_64"; + add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int stride"; + specialize qw/vpx_idct32x32_34_add neon sse2/, "$ssse3_x86_64"; - add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int stride"; specialize qw/vpx_idct32x32_1_add neon sse2/; - add_proto qw/void vpx_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; - specialize qw/vpx_highbd_idct4x4_16_add sse2/; + add_proto qw/void vpx_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; + specialize qw/vpx_highbd_idct4x4_16_add neon sse2/; - add_proto qw/void vpx_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; - specialize qw/vpx_highbd_idct8x8_64_add sse2/; + add_proto qw/void vpx_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; + specialize qw/vpx_highbd_idct8x8_64_add neon sse2/; - add_proto qw/void vpx_highbd_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; - specialize qw/vpx_highbd_idct8x8_12_add sse2/; + add_proto qw/void vpx_highbd_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; + specialize qw/vpx_highbd_idct8x8_12_add neon sse2/; - add_proto qw/void vpx_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; + add_proto qw/void vpx_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; specialize qw/vpx_highbd_idct16x16_256_add sse2/; - add_proto qw/void vpx_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; + add_proto qw/void vpx_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; specialize qw/vpx_highbd_idct16x16_10_add sse2/; } # CONFIG_EMULATE_HARDWARE } else { # Force C versions if CONFIG_EMULATE_HARDWARE is 1 if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") { - add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, 
uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - add_proto qw/void vpx_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - add_proto qw/void vpx_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride"; } else { - add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride"; specialize qw/vpx_idct4x4_1_add sse2 neon dspr2 msa/; - add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride"; specialize qw/vpx_idct4x4_16_add sse2 neon dspr2 msa/; - add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int stride"; specialize qw/vpx_idct8x8_1_add sse2 neon dspr2 msa/; - add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride"; specialize qw/vpx_idct8x8_64_add sse2 neon dspr2 msa/, "$ssse3_x86_64"; - add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int stride"; specialize qw/vpx_idct8x8_12_add sse2 neon dspr2 msa/, "$ssse3_x86_64"; - add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int stride"; specialize qw/vpx_idct16x16_1_add sse2 neon dspr2 msa/; - add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int stride"; specialize qw/vpx_idct16x16_256_add sse2 neon dspr2 msa/; - add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int stride"; specialize qw/vpx_idct16x16_10_add 
sse2 neon dspr2 msa/; - add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int stride"; specialize qw/vpx_idct32x32_1024_add sse2 neon dspr2 msa/, "$ssse3_x86_64"; - add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int stride"; specialize qw/vpx_idct32x32_135_add sse2 neon dspr2 msa/, "$ssse3_x86_64"; - # Need to add 135 eob idct32x32 implementations. $vpx_idct32x32_135_add_sse2=vpx_idct32x32_1024_add_sse2; - $vpx_idct32x32_135_add_neon=vpx_idct32x32_1024_add_neon; $vpx_idct32x32_135_add_dspr2=vpx_idct32x32_1024_add_dspr2; $vpx_idct32x32_135_add_msa=vpx_idct32x32_1024_add_msa; - add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int stride"; specialize qw/vpx_idct32x32_34_add sse2 neon dspr2 msa/, "$ssse3_x86_64"; - add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int stride"; specialize qw/vpx_idct32x32_1_add sse2 neon dspr2 msa/; - add_proto qw/void vpx_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride"; specialize qw/vpx_iwht4x4_1_add msa/; - add_proto qw/void vpx_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride"; specialize qw/vpx_iwht4x4_16_add msa sse2/; } # CONFIG_EMULATE_HARDWARE } # CONFIG_VP9_HIGHBITDEPTH @@ -1724,15 +1751,13 @@ if (vpx_config("CONFIG_POSTPROC") eq "yes" || vpx_config("CONFIG_VP9_POSTPROC") specialize qw/vpx_plane_add_noise sse2 msa/; add_proto qw/void vpx_mbpost_proc_down/, "unsigned char *dst, int pitch, int rows, int cols,int flimit"; - specialize qw/vpx_mbpost_proc_down sse2 msa/; - $vpx_mbpost_proc_down_sse2=vpx_mbpost_proc_down_xmm; + specialize qw/vpx_mbpost_proc_down sse2 neon msa/; add_proto qw/void vpx_mbpost_proc_across_ip/, "unsigned char *dst, int pitch, int rows, int cols,int flimit"; - specialize qw/vpx_mbpost_proc_across_ip sse2 msa/; - $vpx_mbpost_proc_across_ip_sse2=vpx_mbpost_proc_across_ip_xmm; + specialize qw/vpx_mbpost_proc_across_ip sse2 neon msa/; add_proto qw/void vpx_post_proc_down_and_across_mb_row/, "unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size"; - specialize qw/vpx_post_proc_down_and_across_mb_row sse2 msa/; + specialize qw/vpx_post_proc_down_and_across_mb_row sse2 neon msa/; } diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/deblock_sse2.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/deblock_sse2.asm index 6df360df44f..ebca50930a0 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/deblock_sse2.asm +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/deblock_sse2.asm @@ -230,11 +230,11 @@ sym(vpx_post_proc_down_and_across_mb_row_sse2): ret %undef flimit -;void vpx_mbpost_proc_down_xmm(unsigned char *dst, -; int pitch, int rows, int cols,int flimit) +;void vpx_mbpost_proc_down_sse2(unsigned char *dst, +; int pitch, int rows, int cols,int flimit) extern 
sym(vpx_rv) -global sym(vpx_mbpost_proc_down_xmm) PRIVATE -sym(vpx_mbpost_proc_down_xmm): +global sym(vpx_mbpost_proc_down_sse2) PRIVATE +sym(vpx_mbpost_proc_down_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 5 @@ -462,10 +462,10 @@ sym(vpx_mbpost_proc_down_xmm): %undef flimit4 -;void vpx_mbpost_proc_across_ip_xmm(unsigned char *src, -; int pitch, int rows, int cols,int flimit) -global sym(vpx_mbpost_proc_across_ip_xmm) PRIVATE -sym(vpx_mbpost_proc_across_ip_xmm): +;void vpx_mbpost_proc_across_ip_sse2(unsigned char *src, +; int pitch, int rows, int cols,int flimit) +global sym(vpx_mbpost_proc_across_ip_sse2) PRIVATE +sym(vpx_mbpost_proc_across_ip_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 5 diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_sse2.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_sse2.c index d5fc1440c41..487a474a675 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_sse2.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_sse2.c @@ -402,10 +402,10 @@ void iadst4_sse2(__m128i *in) { MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, stg2_1, \ stg2_2, stg2_3, stp2_0, stp2_1, stp2_2, stp2_3) \ \ - stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \ - stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \ - stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \ - stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \ + stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ + stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ + stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ + stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ } \ \ /* Stage3 */ \ @@ -413,10 +413,10 @@ void iadst4_sse2(__m128i *in) { const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ \ - stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \ - stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \ - stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \ - stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \ + stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ + stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ + stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ + stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ \ tmp0 = _mm_madd_epi16(lo_56, stg2_1); \ tmp1 = _mm_madd_epi16(hi_56, stg2_1); \ @@ -438,14 +438,14 @@ void iadst4_sse2(__m128i *in) { } \ \ /* Stage4 */ \ - out0 = _mm_adds_epi16(stp1_0, stp2_7); \ - out1 = _mm_adds_epi16(stp1_1, stp1_6); \ - out2 = _mm_adds_epi16(stp1_2, stp1_5); \ - out3 = _mm_adds_epi16(stp1_3, stp2_4); \ - out4 = _mm_subs_epi16(stp1_3, stp2_4); \ - out5 = _mm_subs_epi16(stp1_2, stp1_5); \ - out6 = _mm_subs_epi16(stp1_1, stp1_6); \ - out7 = _mm_subs_epi16(stp1_0, stp2_7); \ + out0 = _mm_add_epi16(stp1_0, stp2_7); \ + out1 = _mm_add_epi16(stp1_1, stp1_6); \ + out2 = _mm_add_epi16(stp1_2, stp1_5); \ + out3 = _mm_add_epi16(stp1_3, stp2_4); \ + out4 = _mm_sub_epi16(stp1_3, stp2_4); \ + out5 = _mm_sub_epi16(stp1_2, stp1_5); \ + out6 = _mm_sub_epi16(stp1_1, stp1_6); \ + out7 = _mm_sub_epi16(stp1_0, stp2_7); \ } void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, @@ -866,8 +866,8 @@ void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, stp2_0 = _mm_packs_epi32(tmp0, tmp2); stp2_2 = _mm_packs_epi32(tmp6, tmp4); - tmp0 = _mm_adds_epi16(stp1_4, stp1_5); - tmp1 = _mm_subs_epi16(stp1_4, stp1_5); + tmp0 = _mm_add_epi16(stp1_4, stp1_5); + tmp1 = _mm_sub_epi16(stp1_4, stp1_5); stp2_4 = tmp0; stp2_5 = _mm_unpacklo_epi64(tmp1, zero); @@ -878,8 +878,8 @@ void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, { const __m128i lo_56 = 
_mm_unpacklo_epi16(stp2_5, stp2_6); - tmp4 = _mm_adds_epi16(stp2_0, stp2_2); - tmp6 = _mm_subs_epi16(stp2_0, stp2_2); + tmp4 = _mm_add_epi16(stp2_0, stp2_2); + tmp6 = _mm_sub_epi16(stp2_0, stp2_2); stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4); stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4); @@ -896,10 +896,10 @@ void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, } // Stage4 - tmp0 = _mm_adds_epi16(stp1_3, stp2_4); - tmp1 = _mm_adds_epi16(stp1_2, stp1_5); - tmp2 = _mm_subs_epi16(stp1_3, stp2_4); - tmp3 = _mm_subs_epi16(stp1_2, stp1_5); + tmp0 = _mm_add_epi16(stp1_3, stp2_4); + tmp1 = _mm_add_epi16(stp1_2, stp1_5); + tmp2 = _mm_sub_epi16(stp1_3, stp2_4); + tmp3 = _mm_sub_epi16(stp1_2, stp1_5); TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3) @@ -3449,7 +3449,7 @@ static INLINE __m128i clamp_high_sse2(__m128i value, int bd) { __m128i ubounded, retval; const __m128i zero = _mm_set1_epi16(0); const __m128i one = _mm_set1_epi16(1); - const __m128i max = _mm_subs_epi16(_mm_slli_epi16(one, bd), one); + const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one); ubounded = _mm_cmpgt_epi16(value, max); retval = _mm_andnot_si128(ubounded, value); ubounded = _mm_and_si128(ubounded, max); @@ -4012,7 +4012,7 @@ void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest8, __m128i dc_value, d; const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi16(1); - const __m128i max = _mm_subs_epi16(_mm_slli_epi16(one, bd), one); + const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one); int a, i, j; uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); tran_low_t out; diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm index 20baf820f6b..dee64e3ad36 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm @@ -263,7 +263,7 @@ cglobal idct8x8_64_add, 3, 5, 13, input, output, stride RET -; inverse 8x8 2D-DCT transform with only first 10 coeffs non-zero +; inverse 8x8 2D-DCT transform with only first 12 coeffs non-zero cglobal idct8x8_12_add, 3, 5, 13, input, output, stride mova m8, [pd_8192] mova m11, [pw_16] diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_ports/arm_cpudetect.c b/chromium/third_party/libvpx/source/libvpx/vpx_ports/arm_cpudetect.c index 79c60f7a191..4f9d480ade6 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_ports/arm_cpudetect.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_ports/arm_cpudetect.c @@ -58,8 +58,12 @@ int arm_cpu_caps(void) { #elif defined(_MSC_VER) /* end !CONFIG_RUNTIME_CPU_DETECT */ /*For GetExceptionCode() and EXCEPTION_ILLEGAL_INSTRUCTION.*/ +#ifndef WIN32_LEAN_AND_MEAN #define WIN32_LEAN_AND_MEAN +#endif +#ifndef WIN32_EXTRA_LEAN #define WIN32_EXTRA_LEAN +#endif #include <windows.h> int arm_cpu_caps(void) { diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_ports/vpx_timer.h b/chromium/third_party/libvpx/source/libvpx/vpx_ports/vpx_timer.h index 4aae30e9474..c1f1b602750 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_ports/vpx_timer.h +++ b/chromium/third_party/libvpx/source/libvpx/vpx_ports/vpx_timer.h @@ -21,6 +21,8 @@ /* * Win32 specific includes */ +#undef NOMINMAX +#define NOMINMAX #ifndef WIN32_LEAN_AND_MEAN #define WIN32_LEAN_AND_MEAN #endif diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_ports/x86.h 
b/chromium/third_party/libvpx/source/libvpx/vpx_ports/x86.h index 6ba02cf1fcc..5aabb9e3afa 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_ports/x86.h +++ b/chromium/third_party/libvpx/source/libvpx/vpx_ports/x86.h @@ -140,6 +140,11 @@ static INLINE uint64_t xgetbv(void) { #endif #if defined(_MSC_VER) && _MSC_VER >= 1700 +#undef NOMINMAX +#define NOMINMAX +#ifndef WIN32_LEAN_AND_MEAN +#define WIN32_LEAN_AND_MEAN +#endif #include <windows.h> #if WINAPI_FAMILY_PARTITION(WINAPI_FAMILY_APP) #define getenv(x) NULL diff --git a/chromium/third_party/libvpx/source/libvpx/vpxenc.c b/chromium/third_party/libvpx/source/libvpx/vpxenc.c index a0f760574c8..9cd10ab2eb4 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpxenc.c +++ b/chromium/third_party/libvpx/source/libvpx/vpxenc.c @@ -1657,7 +1657,7 @@ static void get_cx_data(struct stream_state *stream, *got_data = 0; while ((pkt = vpx_codec_get_cx_data(&stream->encoder, &iter))) { static size_t fsize = 0; - static int64_t ivf_header_pos = 0; + static FileOffset ivf_header_pos = 0; switch (pkt->kind) { case VPX_CODEC_CX_FRAME_PKT: @@ -1683,7 +1683,7 @@ static void get_cx_data(struct stream_state *stream, fsize += pkt->data.frame.sz; if (!(pkt->data.frame.flags & VPX_FRAME_IS_FRAGMENT)) { - const int64_t currpos = ftello(stream->file); + const FileOffset currpos = ftello(stream->file); fseeko(stream->file, ivf_header_pos, SEEK_SET); ivf_write_frame_size(stream->file, fsize); fseeko(stream->file, currpos, SEEK_SET); |
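The DSPr2 inverse-transform hunks above follow two recurring patterns: the slt/movz sequences clamp the reconstructed pixel to [0, 255] directly in registers instead of indexing vpx_ff_cropTbl with lbux (which is why the prefetch_load() warm-up of vpx_ff_cropTbl disappears), and the new a1 > 255 branches split the DC value into byte-sized pieces, since replv.qb can only replicate an 8-bit value, and apply them with saturating quad-byte adds (addu_s.qb). The following is only a rough scalar sketch of that arithmetic, not code from libvpx; the helper names are hypothetical.

    #include <stdint.h>
    #include <stdio.h>

    /* Scalar view of the slt/movz sequence that replaces the
     * vpx_ff_cropTbl lookup: clamp pred + residual to [0, 255]. */
    static uint8_t clamp_add_sketch(uint8_t pred, int residual) {
      int v = pred + residual;
      if (v < 0) v = 0;
      if (v > 255) v = 255;
      return (uint8_t)v;
    }

    /* Scalar view of the a1 > 255 branch in the 4x4 DC path:
     * a1 is split as a1 = 7 * a11 + a12 so each piece fits in a byte
     * and can be applied with repeated saturating byte adds. */
    static uint8_t dc_add_split_sketch(uint8_t pred, int a1) {
      int a11 = a1 >> 3;      /* replicated piece, added seven times */
      int a12 = a1 - 7 * a11; /* remainder applied in the final add */
      int v = pred;
      int i;
      for (i = 0; i < 7; ++i) {
        v += a11;
        if (v > 255) v = 255; /* addu_s.qb saturates at every step */
      }
      v += a12;
      if (v > 255) v = 255;
      return (uint8_t)v;
    }

    int main(void) {
      /* 250 + 20 clamps to 255; 10 - 30 clamps to 0. */
      printf("%u %u\n", clamp_add_sketch(250, 20), clamp_add_sketch(10, -30));
      /* DC value 300 splits as 7 * 37 + 41 = 300 before the saturating adds. */
      printf("%u\n", dc_add_split_sketch(100, 300));
      return 0;
    }

The 8x8 variant in the patch uses the same idea with a coarser split (a11 = a1 >> 2, a12 = a1 - 3 * a11), trading fewer adds per row for the wider store of two words per iteration.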