author     Allan Sandfeld Jensen <allan.jensen@qt.io>  2017-03-08 10:28:10 +0100
committer  Allan Sandfeld Jensen <allan.jensen@qt.io>  2017-03-20 13:40:30 +0000
commit     e733310db58160074f574c429d48f8308c0afe17 (patch)
tree       f8aef4b7e62a69928dbcf880620eece20f98c6df /chromium/third_party/libvpx/source/libvpx
parent     2f583e4aec1ae3a86fa047829c96b310dc12ecdf (diff)
download   qtwebengine-chromium-e733310db58160074f574c429d48f8308c0afe17.tar.gz
BASELINE: Update Chromium to 56.0.2924.122
Change-Id: I4e04de8f47e47e501c46ed934c76a431c6337ced
Reviewed-by: Michael Brüning <michael.bruning@qt.io>
Diffstat (limited to 'chromium/third_party/libvpx/source/libvpx')
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/build/make/Android.mk | 28
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/build/make/Makefile | 10
-rwxr-xr-x  chromium/third_party/libvpx/source/libvpx/build/make/ads2gas.pl | 8
-rwxr-xr-x  chromium/third_party/libvpx/source/libvpx/build/make/ads2gas_apple.pl | 12
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/build/make/configure.sh | 7
-rwxr-xr-x  chromium/third_party/libvpx/source/libvpx/build/make/gen_msvs_vcxproj.sh | 6
-rwxr-xr-x  chromium/third_party/libvpx/source/libvpx/configure | 6
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/examples/vp9_spatial_svc_encoder.c | 34
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/libs.mk | 4
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/third_party/libwebm/README.libvpx | 2
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/third_party/libwebm/common/file_util.cc | 17
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/third_party/libwebm/common/hdr_util.cc | 168
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/third_party/libwebm/common/hdr_util.h | 24
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/third_party/libwebm/common/webmids.h | 8
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.cc | 819
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.h | 304
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc | 325
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.h | 85
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvmuxer/mkvwriter.cc | 2
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvparser/mkvparser.cc | 114
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvparser/mkvparser.h | 33
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvparser/mkvreader.cc | 4
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/tools.mk | 110
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/tools/tiny_ssim.c | 200
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/loopfilter_filters.c | 4
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/mips/msa/idct_msa.c | 127
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/mips/msa/loopfilter_filters_msa.c | 333
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/mips/msa/vp8_macros_msa.h | 2
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/common/onyx.h | 2
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_if.c | 46
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_int.h | 8
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/encoder/ratectrl.c | 211
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp8/vp8_cx_iface.c | 11
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_frame_buffers.c | 12
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_idct.c | 4
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_rtcd_defs.pl | 1
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodeframe.c | 3
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodemv.c | 4
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_bitstream.c | 245
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_bitstream.h | 18
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeframe.c | 55
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemv.c | 7
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemv.h | 3
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.c | 35
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.h | 8
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ethread.c | 4
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.c | 142
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_pickmode.c | 63
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.c | 50
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.h | 1
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.c | 5
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c | 40
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/vp9_dx_iface.c | 18
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vp9/vp9_dx_iface.h | 4
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx/src/svc_encodeframe.c | 59
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx/svc_context.h | 2
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx/vp8cx.h | 20
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx/vp8dx.h | 7
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/fwd_txfm_neon.c | 8
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_loopfilter_neon.c | 761
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c | 923
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c | 185
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c | 103
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c | 65
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.asm | 5
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c | 2
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.asm | 150
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.c | 60
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.asm | 144
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c | 6
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c | 519
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_add_neon.asm | 1299
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.asm | 5
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c | 2
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_add_neon.asm | 19
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_add_neon.c | 15
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.asm | 5
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c | 2
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_add_neon.asm | 88
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_add_neon.c | 65
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.asm | 30
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.h | 172
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/intrapred_neon.c | 1051
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/loopfilter_neon.c | 12
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/transpose_neon.h | 177
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c | 32
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/vpx_convolve_neon.c | 13
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/fwd_txfm.c | 136
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/inv_txfm.c | 421
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/inv_txfm.h | 5
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/loopfilter.c | 8
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/deblock_msa.c | 5
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/inv_txfm_msa.h | 14
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/loopfilter_16_msa.c | 4
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/loopfilter_4_msa.c | 4
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/loopfilter_8_msa.c | 4
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/loopfilter_msa.h | 235
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/macros_msa.h | 68
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/sad_msa.c | 24
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/variance_msa.c | 12
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/vpx_convolve_msa.h | 27
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_convolve.c | 57
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp.mk | 42
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl | 62
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_filter.h | 11
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/fdct.h | 57
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c | 6
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_sse2.c | 12
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/quantize_sse2.c | 59
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c | 10
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_mem/vpx_mem.c | 32
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpx_mem/vpx_mem.h | 1
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpxdec.c | 59
-rw-r--r--  chromium/third_party/libvpx/source/libvpx/vpxenc.c | 26
114 files changed, 7115 insertions, 4088 deletions
diff --git a/chromium/third_party/libvpx/source/libvpx/build/make/Android.mk b/chromium/third_party/libvpx/source/libvpx/build/make/Android.mk
index 36120170e81..09bdc5d2f70 100644
--- a/chromium/third_party/libvpx/source/libvpx/build/make/Android.mk
+++ b/chromium/third_party/libvpx/source/libvpx/build/make/Android.mk
@@ -71,7 +71,7 @@ ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
include $(CONFIG_DIR)libs-armv7-android-gcc.mk
LOCAL_ARM_MODE := arm
else ifeq ($(TARGET_ARCH_ABI),arm64-v8a)
- include $(CONFIG_DIR)libs-armv8-android-gcc.mk
+ include $(CONFIG_DIR)libs-arm64-android-gcc.mk
LOCAL_ARM_MODE := arm
else ifeq ($(TARGET_ARCH_ABI),x86)
include $(CONFIG_DIR)libs-x86-android-gcc.mk
@@ -101,8 +101,8 @@ LOCAL_CFLAGS := -O3
# like x86inc.asm and x86_abi_support.asm
LOCAL_ASMFLAGS := -I$(LIBVPX_PATH)
-.PRECIOUS: %.asm.s
-$(ASM_CNV_PATH)/libvpx/%.asm.s: $(LIBVPX_PATH)/%.asm
+.PRECIOUS: %.asm.S
+$(ASM_CNV_PATH)/libvpx/%.asm.S: $(LIBVPX_PATH)/%.asm
@mkdir -p $(dir $@)
@$(CONFIG_DIR)$(ASM_CONVERSION) <$< > $@
@@ -132,7 +132,7 @@ endif
# Pull out assembly files, splitting NEON from the rest. This is
# done to specify that the NEON assembly files use NEON assembler flags.
-# x86 assembly matches %.asm, arm matches %.asm.s
+# x86 assembly matches %.asm, arm matches %.asm.S
# x86:
@@ -140,12 +140,12 @@ CODEC_SRCS_ASM_X86 = $(filter %.asm, $(CODEC_SRCS_UNIQUE))
LOCAL_SRC_FILES += $(foreach file, $(CODEC_SRCS_ASM_X86), libvpx/$(file))
# arm:
-CODEC_SRCS_ASM_ARM_ALL = $(filter %.asm.s, $(CODEC_SRCS_UNIQUE))
+CODEC_SRCS_ASM_ARM_ALL = $(filter %.asm.S, $(CODEC_SRCS_UNIQUE))
CODEC_SRCS_ASM_ARM = $(foreach v, \
$(CODEC_SRCS_ASM_ARM_ALL), \
$(if $(findstring neon,$(v)),,$(v)))
-CODEC_SRCS_ASM_ADS2GAS = $(patsubst %.s, \
- $(ASM_CNV_PATH_LOCAL)/libvpx/%.s, \
+CODEC_SRCS_ASM_ADS2GAS = $(patsubst %.S, \
+ $(ASM_CNV_PATH_LOCAL)/libvpx/%.S, \
$(CODEC_SRCS_ASM_ARM))
LOCAL_SRC_FILES += $(CODEC_SRCS_ASM_ADS2GAS)
@@ -153,18 +153,19 @@ ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
CODEC_SRCS_ASM_NEON = $(foreach v, \
$(CODEC_SRCS_ASM_ARM_ALL),\
$(if $(findstring neon,$(v)),$(v),))
- CODEC_SRCS_ASM_NEON_ADS2GAS = $(patsubst %.s, \
- $(ASM_CNV_PATH_LOCAL)/libvpx/%.s, \
+ CODEC_SRCS_ASM_NEON_ADS2GAS = $(patsubst %.S, \
+ $(ASM_CNV_PATH_LOCAL)/libvpx/%.S, \
$(CODEC_SRCS_ASM_NEON))
- LOCAL_SRC_FILES += $(patsubst %.s, \
- %.s.neon, \
+ LOCAL_SRC_FILES += $(patsubst %.S, \
+ %.S.neon, \
$(CODEC_SRCS_ASM_NEON_ADS2GAS))
endif
LOCAL_CFLAGS += \
-DHAVE_CONFIG_H=vpx_config.h \
-I$(LIBVPX_PATH) \
- -I$(ASM_CNV_PATH)
+ -I$(ASM_CNV_PATH) \
+ -I$(ASM_CNV_PATH)/libvpx
LOCAL_MODULE := libvpx
@@ -185,7 +186,8 @@ endif
$$(rtcd_dep_template_SRCS): vpx_scale_rtcd.h
$$(rtcd_dep_template_SRCS): vpx_dsp_rtcd.h
-ifneq ($(findstring $(TARGET_ARCH_ABI),x86 x86_64),)
+rtcd_dep_template_CONFIG_ASM_ABIS := x86 x86_64 armeabi-v7a
+ifneq ($(findstring $(TARGET_ARCH_ABI),$(rtcd_dep_template_CONFIG_ASM_ABIS)),)
$$(rtcd_dep_template_SRCS): vpx_config.asm
endif
endef
diff --git a/chromium/third_party/libvpx/source/libvpx/build/make/Makefile b/chromium/third_party/libvpx/source/libvpx/build/make/Makefile
index 469eb74c3aa..cba605786cb 100644
--- a/chromium/third_party/libvpx/source/libvpx/build/make/Makefile
+++ b/chromium/third_party/libvpx/source/libvpx/build/make/Makefile
@@ -90,7 +90,7 @@ all:
.PHONY: clean
clean::
- rm -f $(OBJS-yes) $(OBJS-yes:.o=.d) $(OBJS-yes:.asm.s.o=.asm.s)
+ rm -f $(OBJS-yes) $(OBJS-yes:.o=.d) $(OBJS-yes:.asm.S.o=.asm.S)
rm -f $(CLEAN-OBJS)
.PHONY: clean
@@ -180,13 +180,13 @@ $(BUILD_PFX)%.asm.o: %.asm
$(qexec)$(if $(CONFIG_DEPENDENCY_TRACKING),,mkdir -p $(dir $@))
$(qexec)$(AS) $(ASFLAGS) -o $@ $<
-$(BUILD_PFX)%.s.d: %.s
+$(BUILD_PFX)%.S.d: %.S
$(if $(quiet),@echo " [DEP] $@")
$(qexec)mkdir -p $(dir $@)
$(qexec)$(SRC_PATH_BARE)/build/make/gen_asm_deps.sh \
--build-pfx=$(BUILD_PFX) --depfile=$@ $(ASFLAGS) $< > $@
-$(BUILD_PFX)%.s.o: %.s
+$(BUILD_PFX)%.S.o: %.S
$(if $(quiet),@echo " [AS] $@")
$(qexec)$(if $(CONFIG_DEPENDENCY_TRACKING),,mkdir -p $(dir $@))
$(qexec)$(AS) $(ASFLAGS) -o $@ $<
@@ -198,8 +198,8 @@ $(BUILD_PFX)%.c.S: %.c
$(qexec)$(if $(CONFIG_DEPENDENCY_TRACKING),,mkdir -p $(dir $@))
$(qexec)$(CC) -S $(CFLAGS) -o $@ $<
-.PRECIOUS: %.asm.s
-$(BUILD_PFX)%.asm.s: %.asm
+.PRECIOUS: %.asm.S
+$(BUILD_PFX)%.asm.S: %.asm
$(if $(quiet),@echo " [ASM CONVERSION] $@")
$(qexec)mkdir -p $(dir $@)
$(qexec)$(ASM_CONVERSION) <$< >$@
diff --git a/chromium/third_party/libvpx/source/libvpx/build/make/ads2gas.pl b/chromium/third_party/libvpx/source/libvpx/build/make/ads2gas.pl
index 7272424af2e..029cc4a56f2 100755
--- a/chromium/third_party/libvpx/source/libvpx/build/make/ads2gas.pl
+++ b/chromium/third_party/libvpx/source/libvpx/build/make/ads2gas.pl
@@ -138,14 +138,6 @@ while (<STDIN>)
s/DCD(.*)/.long $1/;
s/DCB(.*)/.byte $1/;
- # RN to .req
- if (s/RN\s+([Rr]\d+|lr)/.req $1/)
- {
- print;
- print "$comment_sub$comment\n" if defined $comment;
- next;
- }
-
# Make function visible to linker, and make additional symbol with
# prepended underscore
s/EXPORT\s+\|([\$\w]*)\|/.global $1 \n\t.type $1, function/;
diff --git a/chromium/third_party/libvpx/source/libvpx/build/make/ads2gas_apple.pl b/chromium/third_party/libvpx/source/libvpx/build/make/ads2gas_apple.pl
index 1a9e105ba8d..e1ae7b4f871 100755
--- a/chromium/third_party/libvpx/source/libvpx/build/make/ads2gas_apple.pl
+++ b/chromium/third_party/libvpx/source/libvpx/build/make/ads2gas_apple.pl
@@ -120,18 +120,6 @@ while (<STDIN>)
s/DCD(.*)/.long $1/;
s/DCB(.*)/.byte $1/;
- # Build a hash of all the register - alias pairs.
- if (s/(.*)RN(.*)/$1 .req $2/g)
- {
- $register_aliases{trim($1)} = trim($2);
- next;
- }
-
- while (($key, $value) = each(%register_aliases))
- {
- s/\b$key\b/$value/g;
- }
-
# Make function visible to linker, and make additional symbol with
# prepended underscore
s/EXPORT\s+\|([\$\w]*)\|/.globl _$1\n\t.globl $1/;
diff --git a/chromium/third_party/libvpx/source/libvpx/build/make/configure.sh b/chromium/third_party/libvpx/source/libvpx/build/make/configure.sh
index 35609e89af4..007e0200023 100644
--- a/chromium/third_party/libvpx/source/libvpx/build/make/configure.sh
+++ b/chromium/third_party/libvpx/source/libvpx/build/make/configure.sh
@@ -635,7 +635,7 @@ setup_gnu_toolchain() {
AS=${AS:-${CROSS}as}
STRIP=${STRIP:-${CROSS}strip}
NM=${NM:-${CROSS}nm}
- AS_SFX=.s
+ AS_SFX=.S
EXE_SFX=
}
@@ -926,7 +926,7 @@ EOF
;;
vs*)
asm_conversion_cmd="${source_path}/build/make/ads2armasm_ms.pl"
- AS_SFX=.s
+ AS_SFX=.S
msvs_arch_dir=arm-msvs
disable_feature multithread
disable_feature unit_tests
@@ -936,6 +936,7 @@ EOF
# only "AppContainerApplication" which requires an AppxManifest.
# Therefore disable the examples, just build the library.
disable_feature examples
+ disable_feature tools
fi
;;
rvct)
@@ -1034,7 +1035,7 @@ EOF
STRIP="$(${XCRUN_FIND} strip)"
NM="$(${XCRUN_FIND} nm)"
RANLIB="$(${XCRUN_FIND} ranlib)"
- AS_SFX=.s
+ AS_SFX=.S
LD="${CXX:-$(${XCRUN_FIND} ld)}"
# ASFLAGS is written here instead of using check_add_asflags
diff --git a/chromium/third_party/libvpx/source/libvpx/build/make/gen_msvs_vcxproj.sh b/chromium/third_party/libvpx/source/libvpx/build/make/gen_msvs_vcxproj.sh
index e98611d1024..2cf62c117c2 100755
--- a/chromium/third_party/libvpx/source/libvpx/build/make/gen_msvs_vcxproj.sh
+++ b/chromium/third_party/libvpx/source/libvpx/build/make/gen_msvs_vcxproj.sh
@@ -82,7 +82,7 @@ generate_filter() {
| sed -e "s,$src_path_bare,," \
-e 's/^[\./]\+//g' -e 's,[:/ ],_,g')
- if ([ "$pat" == "asm" ] || [ "$pat" == "s" ]) && $asm_use_custom_step; then
+ if ([ "$pat" == "asm" ] || [ "$pat" == "s" ] || [ "$pat" == "S" ]) && $asm_use_custom_step; then
# Avoid object file name collisions, i.e. vpx_config.c and
# vpx_config.asm produce the same object file without
# this additional suffix.
@@ -203,7 +203,7 @@ for opt in "$@"; do
# The paths in file_list are fixed outside of the loop.
file_list[${#file_list[@]}]="$opt"
case "$opt" in
- *.asm|*.s) uses_asm=true
+ *.asm|*.[Ss]) uses_asm=true
;;
esac
;;
@@ -452,7 +452,7 @@ generate_vcxproj() {
done
open_tag ItemGroup
- generate_filter "Source Files" "c;cc;cpp;def;odl;idl;hpj;bat;asm;asmx;s"
+ generate_filter "Source Files" "c;cc;cpp;def;odl;idl;hpj;bat;asm;asmx;s;S"
close_tag ItemGroup
open_tag ItemGroup
generate_filter "Header Files" "h;hm;inl;inc;xsd"
diff --git a/chromium/third_party/libvpx/source/libvpx/configure b/chromium/third_party/libvpx/source/libvpx/configure
index 7065dfef538..fb732acf3e5 100755
--- a/chromium/third_party/libvpx/source/libvpx/configure
+++ b/chromium/third_party/libvpx/source/libvpx/configure
@@ -22,6 +22,7 @@ show_help(){
Advanced options:
${toggle_libs} libraries
${toggle_examples} examples
+ ${toggle_tools} tools
${toggle_docs} documentation
${toggle_unit_tests} unit tests
${toggle_decode_perf_tests} build decoder perf tests with unit tests
@@ -155,7 +156,7 @@ all_platforms="${all_platforms} generic-gnu"
# all_targets is a list of all targets that can be configured
# note that these should be in dependency order for now.
-all_targets="libs examples docs"
+all_targets="libs examples tools docs"
# all targets available are enabled, by default.
for t in ${all_targets}; do
@@ -331,6 +332,7 @@ CMDLINE_SELECT="
libs
examples
+ tools
docs
libc
as
@@ -476,7 +478,7 @@ EOF
#
# Write makefiles for all enabled targets
#
- for tgt in libs examples docs solution; do
+ for tgt in libs examples tools docs solution; do
tgt_fn="$tgt-$toolchain.mk"
if enabled $tgt; then
diff --git a/chromium/third_party/libvpx/source/libvpx/examples/vp9_spatial_svc_encoder.c b/chromium/third_party/libvpx/source/libvpx/examples/vp9_spatial_svc_encoder.c
index cecdce0804c..fa2df7271b2 100644
--- a/chromium/third_party/libvpx/source/libvpx/examples/vp9_spatial_svc_encoder.c
+++ b/chromium/third_party/libvpx/source/libvpx/examples/vp9_spatial_svc_encoder.c
@@ -84,6 +84,8 @@ static const arg_def_t speed_arg =
ARG_DEF("sp", "speed", 1, "speed configuration");
static const arg_def_t aqmode_arg =
ARG_DEF("aq", "aqmode", 1, "aq-mode off/on");
+static const arg_def_t bitrates_arg =
+ ARG_DEF("bl", "bitrates", 1, "bitrates[sl * num_tl + tl]");
#if CONFIG_VP9_HIGHBITDEPTH
static const struct arg_enum_list bitdepth_enum[] = {
@@ -124,6 +126,7 @@ static const arg_def_t *svc_args[] = { &frames_arg,
#endif
&speed_arg,
&rc_end_usage_arg,
+ &bitrates_arg,
NULL };
static const uint32_t default_frames_to_skip = 0;
@@ -250,6 +253,9 @@ static void parse_command_line(int argc, const char **argv_,
} else if (arg_match(&arg, &scale_factors_arg, argi)) {
snprintf(string_options, sizeof(string_options), "%s scale-factors=%s",
string_options, arg.val);
+ } else if (arg_match(&arg, &bitrates_arg, argi)) {
+ snprintf(string_options, sizeof(string_options), "%s bitrates=%s",
+ string_options, arg.val);
} else if (arg_match(&arg, &passes_arg, argi)) {
passes = arg_parse_uint(&arg);
if (passes < 1 || passes > 2) {
@@ -417,7 +423,6 @@ static void set_rate_control_stats(struct RateControlStats *rc,
for (sl = 0; sl < cfg->ss_number_layers; ++sl) {
for (tl = 0; tl < cfg->ts_number_layers; ++tl) {
const int layer = sl * cfg->ts_number_layers + tl;
- const int tlayer0 = sl * cfg->ts_number_layers;
if (cfg->ts_number_layers == 1)
rc->layer_framerate[layer] = framerate;
else
@@ -428,8 +433,8 @@ static void set_rate_control_stats(struct RateControlStats *rc,
cfg->layer_target_bitrate[layer - 1]) /
(rc->layer_framerate[layer] - rc->layer_framerate[layer - 1]);
} else {
- rc->layer_pfb[tlayer0] = 1000.0 * cfg->layer_target_bitrate[tlayer0] /
- rc->layer_framerate[tlayer0];
+ rc->layer_pfb[layer] = 1000.0 * cfg->layer_target_bitrate[layer] /
+ rc->layer_framerate[layer];
}
rc->layer_input_frames[layer] = 0;
rc->layer_enc_frames[layer] = 0;
@@ -449,12 +454,13 @@ static void printout_rate_control_summary(struct RateControlStats *rc,
vpx_codec_enc_cfg_t *cfg,
int frame_cnt) {
unsigned int sl, tl;
- int tot_num_frames = 0;
double perc_fluctuation = 0.0;
+ int tot_num_frames = 0;
printf("Total number of processed frames: %d\n\n", frame_cnt - 1);
printf("Rate control layer stats for sl%d tl%d layer(s):\n\n",
cfg->ss_number_layers, cfg->ts_number_layers);
for (sl = 0; sl < cfg->ss_number_layers; ++sl) {
+ tot_num_frames = 0;
for (tl = 0; tl < cfg->ts_number_layers; ++tl) {
const int layer = sl * cfg->ts_number_layers + tl;
const int num_dropped =
@@ -462,7 +468,7 @@ static void printout_rate_control_summary(struct RateControlStats *rc,
? (rc->layer_input_frames[layer] - rc->layer_enc_frames[layer])
: (rc->layer_input_frames[layer] - rc->layer_enc_frames[layer] -
1);
- if (!sl) tot_num_frames += rc->layer_input_frames[layer];
+ tot_num_frames += rc->layer_input_frames[layer];
rc->layer_encoding_bitrate[layer] = 0.001 * rc->layer_framerate[layer] *
rc->layer_encoding_bitrate[layer] /
tot_num_frames;
@@ -620,7 +626,7 @@ int main(int argc, const char **argv) {
struct RateControlStats rc;
vpx_svc_layer_id_t layer_id;
vpx_svc_ref_frame_config_t ref_frame_config;
- int sl, tl;
+ unsigned int sl, tl;
double sum_bitrate = 0.0;
double sum_bitrate2 = 0.0;
double framerate = 30.0;
@@ -695,6 +701,8 @@ int main(int argc, const char **argv) {
vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, (svc_ctx.threads >> 1));
if (svc_ctx.speed >= 5 && svc_ctx.aqmode == 1)
vpx_codec_control(&codec, VP9E_SET_AQ_MODE, 3);
+ if (svc_ctx.speed >= 5)
+ vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1);
// Encode frames
while (!end_of_stream) {
@@ -730,7 +738,7 @@ int main(int argc, const char **argv) {
&ref_frame_config);
// Keep track of input frames, to account for frame drops in rate control
// stats/metrics.
- for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) {
+ for (sl = 0; sl < (unsigned int)enc_cfg.ss_number_layers; ++sl) {
++rc.layer_input_frames[sl * enc_cfg.ts_number_layers +
layer_id.temporal_layer_id];
}
@@ -793,7 +801,7 @@ int main(int argc, const char **argv) {
rc.layer_encoding_bitrate[layer] += 8.0 * sizes[sl];
// Keep count of rate control stats per layer, for non-key
// frames.
- if (tl == layer_id.temporal_layer_id &&
+ if (tl == (unsigned int)layer_id.temporal_layer_id &&
!(cx_pkt->data.frame.flags & VPX_FRAME_IS_KEY)) {
rc.layer_avg_frame_size[layer] += 8.0 * sizes[sl];
rc.layer_avg_rate_mismatch[layer] +=
@@ -807,7 +815,7 @@ int main(int argc, const char **argv) {
// Update for short-time encoding bitrate states, for moving
// window of size rc->window, shifted by rc->window / 2.
// Ignore first window segment, due to key frame.
- if (frame_cnt > rc.window_size) {
+ if (frame_cnt > (unsigned int)rc.window_size) {
tl = layer_id.temporal_layer_id;
for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) {
sum_bitrate += 0.001 * 8.0 * sizes[sl] * framerate;
@@ -823,13 +831,14 @@ int main(int argc, const char **argv) {
}
// Second shifted window.
- if (frame_cnt > rc.window_size + rc.window_size / 2) {
+ if (frame_cnt >
+ (unsigned int)(rc.window_size + rc.window_size / 2)) {
tl = layer_id.temporal_layer_id;
for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) {
sum_bitrate2 += 0.001 * 8.0 * sizes[sl] * framerate;
}
- if (frame_cnt > 2 * rc.window_size &&
+ if (frame_cnt > (unsigned int)(2 * rc.window_size) &&
frame_cnt % rc.window_size == 0) {
rc.window_count += 1;
rc.avg_st_encoding_bitrate += sum_bitrate2 / rc.window_size;
@@ -842,10 +851,11 @@ int main(int argc, const char **argv) {
}
#endif
}
-
+ /*
printf("SVC frame: %d, kf: %d, size: %d, pts: %d\n", frames_received,
!!(cx_pkt->data.frame.flags & VPX_FRAME_IS_KEY),
(int)cx_pkt->data.frame.sz, (int)cx_pkt->data.frame.pts);
+ */
if (enc_cfg.ss_number_layers == 1 && enc_cfg.ts_number_layers == 1)
si->bytes_sum[0] += (int)cx_pkt->data.frame.sz;
++frames_received;
diff --git a/chromium/third_party/libvpx/source/libvpx/libs.mk b/chromium/third_party/libvpx/source/libvpx/libs.mk
index 6e12b540454..f4f48cc1621 100644
--- a/chromium/third_party/libvpx/source/libvpx/libs.mk
+++ b/chromium/third_party/libvpx/source/libvpx/libs.mk
@@ -12,7 +12,7 @@
# ARM assembly files are written in RVCT-style. We use some make magic to
# filter those files to allow GCC compilation
ifeq ($(ARCH_ARM),yes)
- ASM:=$(if $(filter yes,$(CONFIG_GCC)$(CONFIG_MSVS)),.asm.s,.asm)
+ ASM:=$(if $(filter yes,$(CONFIG_GCC)$(CONFIG_MSVS)),.asm.S,.asm)
else
ASM:=.asm
endif
@@ -366,7 +366,7 @@ endif
#
# Add assembler dependencies for configuration.
#
-$(filter %.s.o,$(OBJS-yes)): $(BUILD_PFX)vpx_config.asm
+$(filter %.S.o,$(OBJS-yes)): $(BUILD_PFX)vpx_config.asm
$(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)vpx_config.asm
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/README.libvpx b/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/README.libvpx
index 73f83032225..1f8a13d78c1 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/README.libvpx
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/README.libvpx
@@ -1,5 +1,5 @@
URL: https://chromium.googlesource.com/webm/libwebm
-Version: 32d5ac49414a8914ec1e1f285f3f927c6e8ec29d
+Version: 9732ae991efb71aced4267d4794918279e362d99
License: BSD
License File: LICENSE.txt
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/common/file_util.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/common/file_util.cc
index 4f91318f3e9..6dab146dd98 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/common/file_util.cc
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/common/file_util.cc
@@ -14,6 +14,7 @@
#include <cstdio>
#include <cstdlib>
+#include <cstring>
#include <fstream>
#include <ios>
@@ -21,13 +22,23 @@ namespace libwebm {
std::string GetTempFileName() {
#if !defined _MSC_VER && !defined __MINGW32__
- char temp_file_name_template[] = "libwebm_temp.XXXXXX";
+ std::string temp_file_name_template_str =
+ std::string(std::getenv("TEST_TMPDIR") ? std::getenv("TEST_TMPDIR") :
+ ".") +
+ "/libwebm_temp.XXXXXX";
+ char* temp_file_name_template =
+ new char[temp_file_name_template_str.length() + 1];
+ memset(temp_file_name_template, 0, temp_file_name_template_str.length() + 1);
+ temp_file_name_template_str.copy(temp_file_name_template,
+ temp_file_name_template_str.length(), 0);
int fd = mkstemp(temp_file_name_template);
+ std::string temp_file_name =
+ (fd != -1) ? std::string(temp_file_name_template) : std::string();
+ delete[] temp_file_name_template;
if (fd != -1) {
close(fd);
- return std::string(temp_file_name_template);
}
- return std::string();
+ return temp_file_name;
#else
char tmp_file_name[_MAX_PATH];
errno_t err = tmpnam_s(tmp_file_name);
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/common/hdr_util.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/common/hdr_util.cc
index e1a9842fb6e..e1618ce75a7 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/common/hdr_util.cc
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/common/hdr_util.cc
@@ -7,12 +7,15 @@
// be found in the AUTHORS file in the root of the source tree.
#include "hdr_util.h"
+#include <climits>
#include <cstddef>
#include <new>
#include "mkvparser/mkvparser.h"
namespace libwebm {
+const int Vp9CodecFeatures::kValueNotPresent = INT_MAX;
+
bool CopyPrimaryChromaticity(const mkvparser::PrimaryChromaticity& parser_pc,
PrimaryChromaticityPtr* muxer_pc) {
muxer_pc->reset(new (std::nothrow)
@@ -29,9 +32,9 @@ bool MasteringMetadataValuePresent(double value) {
bool CopyMasteringMetadata(const mkvparser::MasteringMetadata& parser_mm,
mkvmuxer::MasteringMetadata* muxer_mm) {
if (MasteringMetadataValuePresent(parser_mm.luminance_max))
- muxer_mm->luminance_max = parser_mm.luminance_max;
+ muxer_mm->set_luminance_max(parser_mm.luminance_max);
if (MasteringMetadataValuePresent(parser_mm.luminance_min))
- muxer_mm->luminance_min = parser_mm.luminance_min;
+ muxer_mm->set_luminance_min(parser_mm.luminance_min);
PrimaryChromaticityPtr r_ptr(NULL);
PrimaryChromaticityPtr g_ptr(NULL);
@@ -73,34 +76,37 @@ bool CopyColour(const mkvparser::Colour& parser_colour,
return false;
if (ColourValuePresent(parser_colour.matrix_coefficients))
- muxer_colour->matrix_coefficients = parser_colour.matrix_coefficients;
+ muxer_colour->set_matrix_coefficients(parser_colour.matrix_coefficients);
if (ColourValuePresent(parser_colour.bits_per_channel))
- muxer_colour->bits_per_channel = parser_colour.bits_per_channel;
- if (ColourValuePresent(parser_colour.chroma_subsampling_horz))
- muxer_colour->chroma_subsampling_horz =
- parser_colour.chroma_subsampling_horz;
- if (ColourValuePresent(parser_colour.chroma_subsampling_vert))
- muxer_colour->chroma_subsampling_vert =
- parser_colour.chroma_subsampling_vert;
+ muxer_colour->set_bits_per_channel(parser_colour.bits_per_channel);
+ if (ColourValuePresent(parser_colour.chroma_subsampling_horz)) {
+ muxer_colour->set_chroma_subsampling_horz(
+ parser_colour.chroma_subsampling_horz);
+ }
+ if (ColourValuePresent(parser_colour.chroma_subsampling_vert)) {
+ muxer_colour->set_chroma_subsampling_vert(
+ parser_colour.chroma_subsampling_vert);
+ }
if (ColourValuePresent(parser_colour.cb_subsampling_horz))
- muxer_colour->cb_subsampling_horz = parser_colour.cb_subsampling_horz;
+ muxer_colour->set_cb_subsampling_horz(parser_colour.cb_subsampling_horz);
if (ColourValuePresent(parser_colour.cb_subsampling_vert))
- muxer_colour->cb_subsampling_vert = parser_colour.cb_subsampling_vert;
+ muxer_colour->set_cb_subsampling_vert(parser_colour.cb_subsampling_vert);
if (ColourValuePresent(parser_colour.chroma_siting_horz))
- muxer_colour->chroma_siting_horz = parser_colour.chroma_siting_horz;
+ muxer_colour->set_chroma_siting_horz(parser_colour.chroma_siting_horz);
if (ColourValuePresent(parser_colour.chroma_siting_vert))
- muxer_colour->chroma_siting_vert = parser_colour.chroma_siting_vert;
+ muxer_colour->set_chroma_siting_vert(parser_colour.chroma_siting_vert);
if (ColourValuePresent(parser_colour.range))
- muxer_colour->range = parser_colour.range;
- if (ColourValuePresent(parser_colour.transfer_characteristics))
- muxer_colour->transfer_characteristics =
- parser_colour.transfer_characteristics;
+ muxer_colour->set_range(parser_colour.range);
+ if (ColourValuePresent(parser_colour.transfer_characteristics)) {
+ muxer_colour->set_transfer_characteristics(
+ parser_colour.transfer_characteristics);
+ }
if (ColourValuePresent(parser_colour.primaries))
- muxer_colour->primaries = parser_colour.primaries;
+ muxer_colour->set_primaries(parser_colour.primaries);
if (ColourValuePresent(parser_colour.max_cll))
- muxer_colour->max_cll = parser_colour.max_cll;
+ muxer_colour->set_max_cll(parser_colour.max_cll);
if (ColourValuePresent(parser_colour.max_fall))
- muxer_colour->max_fall = parser_colour.max_fall;
+ muxer_colour->set_max_fall(parser_colour.max_fall);
if (parser_colour.mastering_metadata) {
mkvmuxer::MasteringMetadata muxer_mm;
@@ -116,8 +122,8 @@ bool CopyColour(const mkvparser::Colour& parser_colour,
//
// 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-// | ID Byte | Length | |
-// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |
+// | ID Byte | Length | |
+// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |
// | |
// : Bytes 1..Length of Codec Feature :
// | |
@@ -132,51 +138,83 @@ bool CopyColour(const mkvparser::Colour& parser_colour,
//
// The X bit is reserved.
//
-// Currently only profile level is supported. ID byte must be set to 1, and
-// length must be 1. Supported values are:
-//
-// 10: Level 1
-// 11: Level 1.1
-// 20: Level 2
-// 21: Level 2.1
-// 30: Level 3
-// 31: Level 3.1
-// 40: Level 4
-// 41: Level 4.1
-// 50: Level 5
-// 51: Level 5.1
-// 52: Level 5.2
-// 60: Level 6
-// 61: Level 6.1
-// 62: Level 6.2
-//
// See the following link for more information:
// http://www.webmproject.org/vp9/profiles/
-int ParseVpxCodecPrivate(const uint8_t* private_data, int32_t length) {
- const int kVpxCodecPrivateLength = 3;
- if (!private_data || length != kVpxCodecPrivateLength)
- return 0;
-
- const uint8_t id_byte = *private_data;
- if (id_byte != 1)
- return 0;
-
- const int kVpxProfileLength = 1;
- const uint8_t length_byte = private_data[1];
- if (length_byte != kVpxProfileLength)
- return 0;
-
- const int level = static_cast<int>(private_data[2]);
-
- const int kNumLevels = 14;
- const int levels[kNumLevels] = {10, 11, 20, 21, 30, 31, 40,
- 41, 50, 51, 52, 60, 61, 62};
+bool ParseVpxCodecPrivate(const uint8_t* private_data, int32_t length,
+ Vp9CodecFeatures* features) {
+ const int kVpxCodecPrivateMinLength = 3;
+ if (!private_data || !features || length < kVpxCodecPrivateMinLength)
+ return false;
- for (int i = 0; i < kNumLevels; ++i) {
- if (level == levels[i])
- return level;
- }
+ const uint8_t kVp9ProfileId = 1;
+ const uint8_t kVp9LevelId = 2;
+ const uint8_t kVp9BitDepthId = 3;
+ const uint8_t kVp9ChromaSubsamplingId = 4;
+ const int kVpxFeatureLength = 1;
+ int offset = 0;
+
+ // Set features to not set.
+ features->profile = Vp9CodecFeatures::kValueNotPresent;
+ features->level = Vp9CodecFeatures::kValueNotPresent;
+ features->bit_depth = Vp9CodecFeatures::kValueNotPresent;
+ features->chroma_subsampling = Vp9CodecFeatures::kValueNotPresent;
+ do {
+ const uint8_t id_byte = private_data[offset++];
+ const uint8_t length_byte = private_data[offset++];
+ if (length_byte != kVpxFeatureLength)
+ return false;
+ if (id_byte == kVp9ProfileId) {
+ const int priv_profile = static_cast<int>(private_data[offset++]);
+ if (priv_profile < 0 || priv_profile > 3)
+ return false;
+ if (features->profile != Vp9CodecFeatures::kValueNotPresent &&
+ features->profile != priv_profile) {
+ return false;
+ }
+ features->profile = priv_profile;
+ } else if (id_byte == kVp9LevelId) {
+ const int priv_level = static_cast<int>(private_data[offset++]);
+
+ const int kNumLevels = 14;
+ const int levels[kNumLevels] = {10, 11, 20, 21, 30, 31, 40,
+ 41, 50, 51, 52, 60, 61, 62};
+
+ for (int i = 0; i < kNumLevels; ++i) {
+ if (priv_level == levels[i]) {
+ if (features->level != Vp9CodecFeatures::kValueNotPresent &&
+ features->level != priv_level) {
+ return false;
+ }
+ features->level = priv_level;
+ break;
+ }
+ }
+ if (features->level == Vp9CodecFeatures::kValueNotPresent)
+ return false;
+ } else if (id_byte == kVp9BitDepthId) {
+ const int priv_profile = static_cast<int>(private_data[offset++]);
+ if (priv_profile != 8 && priv_profile != 10 && priv_profile != 12)
+ return false;
+ if (features->bit_depth != Vp9CodecFeatures::kValueNotPresent &&
+ features->bit_depth != priv_profile) {
+ return false;
+ }
+ features->bit_depth = priv_profile;
+ } else if (id_byte == kVp9ChromaSubsamplingId) {
+ const int priv_profile = static_cast<int>(private_data[offset++]);
+ if (priv_profile != 0 && priv_profile != 2 && priv_profile != 3)
+ return false;
+ if (features->chroma_subsampling != Vp9CodecFeatures::kValueNotPresent &&
+ features->chroma_subsampling != priv_profile) {
+ return false;
+ }
+ features->chroma_subsampling = priv_profile;
+ } else {
+ // Invalid ID.
+ return false;
+ }
+ } while (offset + kVpxCodecPrivateMinLength <= length);
- return 0;
+ return true;
}
} // namespace libwebm
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/common/hdr_util.h b/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/common/hdr_util.h
index d30c2b9f2a0..689fb30a3fc 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/common/hdr_util.h
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/common/hdr_util.h
@@ -28,6 +28,25 @@ namespace libwebm {
// TODO(tomfinegan): These should be moved to libwebm_utils once c++11 is
// required by libwebm.
+// Features of the VP9 codec that may be set in the CodecPrivate of a VP9 video
+// stream. A value of kValueNotPresent represents that the value was not set in
+// the CodecPrivate.
+struct Vp9CodecFeatures {
+ static const int kValueNotPresent;
+
+ Vp9CodecFeatures()
+ : profile(kValueNotPresent),
+ level(kValueNotPresent),
+ bit_depth(kValueNotPresent),
+ chroma_subsampling(kValueNotPresent) {}
+ ~Vp9CodecFeatures() {}
+
+ int profile;
+ int level;
+ int bit_depth;
+ int chroma_subsampling;
+};
+
typedef std::auto_ptr<mkvmuxer::PrimaryChromaticity> PrimaryChromaticityPtr;
bool CopyPrimaryChromaticity(const mkvparser::PrimaryChromaticity& parser_pc,
@@ -43,8 +62,9 @@ bool ColourValuePresent(long long value);
bool CopyColour(const mkvparser::Colour& parser_colour,
mkvmuxer::Colour* muxer_colour);
-// Returns VP9 profile upon success or 0 upon failure.
-int ParseVpxCodecPrivate(const uint8_t* private_data, int32_t length);
+// Returns true if |features| is set to one or more valid values.
+bool ParseVpxCodecPrivate(const uint8_t* private_data, int32_t length,
+ Vp9CodecFeatures* features);
} // namespace libwebm
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/common/webmids.h b/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/common/webmids.h
index 32a0c5fb911..89d722a71bc 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/common/webmids.h
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/common/webmids.h
@@ -124,6 +124,14 @@ enum MkvId {
kMkvLuminanceMin = 0x55DA,
// end mastering metadata
// end colour
+ // projection
+ kMkvProjection = 0x7670,
+ kMkvProjectionType = 0x7671,
+ kMkvProjectionPrivate = 0x7672,
+ kMkvProjectionPoseYaw = 0x7673,
+ kMkvProjectionPosePitch = 0x7674,
+ kMkvProjectionPoseRoll = 0x7675,
+ // end projection
// audio
kMkvAudio = 0xE1,
kMkvSamplingFrequency = 0xB5,
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.cc
index c79ce24ed35..299b45c989c 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.cc
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.cc
@@ -16,6 +16,7 @@
#include <ctime>
#include <memory>
#include <new>
+#include <string>
#include <vector>
#include "common/webmids.h"
@@ -25,10 +26,19 @@
namespace mkvmuxer {
+const float PrimaryChromaticity::kChromaticityMin = 0.0f;
+const float PrimaryChromaticity::kChromaticityMax = 1.0f;
+const float MasteringMetadata::kMinLuminance = 0.0f;
+const float MasteringMetadata::kMinLuminanceMax = 999.99f;
+const float MasteringMetadata::kMaxLuminanceMax = 9999.99f;
const float MasteringMetadata::kValueNotPresent = FLT_MAX;
const uint64_t Colour::kValueNotPresent = UINT64_MAX;
namespace {
+
+const char kDocTypeWebm[] = "webm";
+const char kDocTypeMatroska[] = "matroska";
+
// Deallocate the string designated by |dst|, and then copy the |src|
// string to |dst|. The caller owns both the |src| string and the
// |dst| copy (hence the caller is responsible for eventually
@@ -63,7 +73,7 @@ bool CopyChromaticity(const PrimaryChromaticity* src,
if (!dst)
return false;
- dst->reset(new (std::nothrow) PrimaryChromaticity(src->x, src->y));
+ dst->reset(new (std::nothrow) PrimaryChromaticity(src->x(), src->y()));
if (!dst->get())
return false;
@@ -80,36 +90,57 @@ IMkvWriter::IMkvWriter() {}
IMkvWriter::~IMkvWriter() {}
-bool WriteEbmlHeader(IMkvWriter* writer, uint64_t doc_type_version) {
+bool WriteEbmlHeader(IMkvWriter* writer, uint64_t doc_type_version,
+ const char* const doc_type) {
// Level 0
- uint64_t size = EbmlElementSize(libwebm::kMkvEBMLVersion, UINT64_C(1));
- size += EbmlElementSize(libwebm::kMkvEBMLReadVersion, UINT64_C(1));
- size += EbmlElementSize(libwebm::kMkvEBMLMaxIDLength, UINT64_C(4));
- size += EbmlElementSize(libwebm::kMkvEBMLMaxSizeLength, UINT64_C(8));
- size += EbmlElementSize(libwebm::kMkvDocType, "webm");
- size += EbmlElementSize(libwebm::kMkvDocTypeVersion, doc_type_version);
- size += EbmlElementSize(libwebm::kMkvDocTypeReadVersion, UINT64_C(2));
+ uint64_t size =
+ EbmlElementSize(libwebm::kMkvEBMLVersion, static_cast<uint64>(1));
+ size += EbmlElementSize(libwebm::kMkvEBMLReadVersion, static_cast<uint64>(1));
+ size += EbmlElementSize(libwebm::kMkvEBMLMaxIDLength, static_cast<uint64>(4));
+ size +=
+ EbmlElementSize(libwebm::kMkvEBMLMaxSizeLength, static_cast<uint64>(8));
+ size += EbmlElementSize(libwebm::kMkvDocType, doc_type);
+ size += EbmlElementSize(libwebm::kMkvDocTypeVersion,
+ static_cast<uint64>(doc_type_version));
+ size +=
+ EbmlElementSize(libwebm::kMkvDocTypeReadVersion, static_cast<uint64>(2));
if (!WriteEbmlMasterElement(writer, libwebm::kMkvEBML, size))
return false;
- if (!WriteEbmlElement(writer, libwebm::kMkvEBMLVersion, UINT64_C(1)))
+ if (!WriteEbmlElement(writer, libwebm::kMkvEBMLVersion,
+ static_cast<uint64>(1))) {
return false;
- if (!WriteEbmlElement(writer, libwebm::kMkvEBMLReadVersion, UINT64_C(1)))
+ }
+ if (!WriteEbmlElement(writer, libwebm::kMkvEBMLReadVersion,
+ static_cast<uint64>(1))) {
return false;
- if (!WriteEbmlElement(writer, libwebm::kMkvEBMLMaxIDLength, UINT64_C(4)))
+ }
+ if (!WriteEbmlElement(writer, libwebm::kMkvEBMLMaxIDLength,
+ static_cast<uint64>(4))) {
return false;
- if (!WriteEbmlElement(writer, libwebm::kMkvEBMLMaxSizeLength, UINT64_C(8)))
+ }
+ if (!WriteEbmlElement(writer, libwebm::kMkvEBMLMaxSizeLength,
+ static_cast<uint64>(8))) {
return false;
- if (!WriteEbmlElement(writer, libwebm::kMkvDocType, "webm"))
+ }
+ if (!WriteEbmlElement(writer, libwebm::kMkvDocType, doc_type))
return false;
- if (!WriteEbmlElement(writer, libwebm::kMkvDocTypeVersion, doc_type_version))
+ if (!WriteEbmlElement(writer, libwebm::kMkvDocTypeVersion,
+ static_cast<uint64>(doc_type_version))) {
return false;
- if (!WriteEbmlElement(writer, libwebm::kMkvDocTypeReadVersion, UINT64_C(2)))
+ }
+ if (!WriteEbmlElement(writer, libwebm::kMkvDocTypeReadVersion,
+ static_cast<uint64>(2))) {
return false;
+ }
return true;
}
+bool WriteEbmlHeader(IMkvWriter* writer, uint64_t doc_type_version) {
+ return WriteEbmlHeader(writer, doc_type_version, kDocTypeWebm);
+}
+
bool WriteEbmlHeader(IMkvWriter* writer) {
return WriteEbmlHeader(writer, mkvmuxer::Segment::kDefaultDocTypeVersion);
}
@@ -262,15 +293,17 @@ bool CuePoint::Write(IMkvWriter* writer) const {
if (!writer || track_ < 1 || cluster_pos_ < 1)
return false;
- uint64_t size =
- EbmlElementSize(libwebm::kMkvCueClusterPosition, cluster_pos_);
- size += EbmlElementSize(libwebm::kMkvCueTrack, track_);
+ uint64_t size = EbmlElementSize(libwebm::kMkvCueClusterPosition,
+ static_cast<uint64>(cluster_pos_));
+ size += EbmlElementSize(libwebm::kMkvCueTrack, static_cast<uint64>(track_));
if (output_block_number_ && block_number_ > 1)
- size += EbmlElementSize(libwebm::kMkvCueBlockNumber, block_number_);
+ size += EbmlElementSize(libwebm::kMkvCueBlockNumber,
+ static_cast<uint64>(block_number_));
const uint64_t track_pos_size =
EbmlMasterElementSize(libwebm::kMkvCueTrackPositions, size) + size;
const uint64_t payload_size =
- EbmlElementSize(libwebm::kMkvCueTime, time_) + track_pos_size;
+ EbmlElementSize(libwebm::kMkvCueTime, static_cast<uint64>(time_)) +
+ track_pos_size;
if (!WriteEbmlMasterElement(writer, libwebm::kMkvCuePoint, payload_size))
return false;
@@ -279,18 +312,27 @@ bool CuePoint::Write(IMkvWriter* writer) const {
if (payload_position < 0)
return false;
- if (!WriteEbmlElement(writer, libwebm::kMkvCueTime, time_))
+ if (!WriteEbmlElement(writer, libwebm::kMkvCueTime,
+ static_cast<uint64>(time_))) {
return false;
+ }
if (!WriteEbmlMasterElement(writer, libwebm::kMkvCueTrackPositions, size))
return false;
- if (!WriteEbmlElement(writer, libwebm::kMkvCueTrack, track_))
+ if (!WriteEbmlElement(writer, libwebm::kMkvCueTrack,
+ static_cast<uint64>(track_))) {
return false;
- if (!WriteEbmlElement(writer, libwebm::kMkvCueClusterPosition, cluster_pos_))
+ }
+ if (!WriteEbmlElement(writer, libwebm::kMkvCueClusterPosition,
+ static_cast<uint64>(cluster_pos_))) {
return false;
- if (output_block_number_ && block_number_ > 1)
- if (!WriteEbmlElement(writer, libwebm::kMkvCueBlockNumber, block_number_))
+ }
+ if (output_block_number_ && block_number_ > 1) {
+ if (!WriteEbmlElement(writer, libwebm::kMkvCueBlockNumber,
+ static_cast<uint64>(block_number_))) {
return false;
+ }
+ }
const int64_t stop_position = writer->Position();
if (stop_position < 0)
@@ -303,15 +345,17 @@ bool CuePoint::Write(IMkvWriter* writer) const {
}
uint64_t CuePoint::PayloadSize() const {
- uint64_t size =
- EbmlElementSize(libwebm::kMkvCueClusterPosition, cluster_pos_);
- size += EbmlElementSize(libwebm::kMkvCueTrack, track_);
+ uint64_t size = EbmlElementSize(libwebm::kMkvCueClusterPosition,
+ static_cast<uint64>(cluster_pos_));
+ size += EbmlElementSize(libwebm::kMkvCueTrack, static_cast<uint64>(track_));
if (output_block_number_ && block_number_ > 1)
- size += EbmlElementSize(libwebm::kMkvCueBlockNumber, block_number_);
+ size += EbmlElementSize(libwebm::kMkvCueBlockNumber,
+ static_cast<uint64>(block_number_));
const uint64_t track_pos_size =
EbmlMasterElementSize(libwebm::kMkvCueTrackPositions, size) + size;
const uint64_t payload_size =
- EbmlElementSize(libwebm::kMkvCueTime, time_) + track_pos_size;
+ EbmlElementSize(libwebm::kMkvCueTime, static_cast<uint64>(time_)) +
+ track_pos_size;
return payload_size;
}
@@ -456,8 +500,9 @@ bool ContentEncAESSettings::Write(IMkvWriter* writer) const {
return false;
if (!WriteEbmlElement(writer, libwebm::kMkvAESSettingsCipherMode,
- cipher_mode_))
+ static_cast<uint64>(cipher_mode_))) {
return false;
+ }
const int64_t stop_position = writer->Position();
if (stop_position < 0 ||
@@ -468,8 +513,8 @@ bool ContentEncAESSettings::Write(IMkvWriter* writer) const {
}
uint64_t ContentEncAESSettings::PayloadSize() const {
- uint64_t size =
- EbmlElementSize(libwebm::kMkvAESSettingsCipherMode, cipher_mode_);
+ uint64_t size = EbmlElementSize(libwebm::kMkvAESSettingsCipherMode,
+ static_cast<uint64>(cipher_mode_));
return size;
}
@@ -529,20 +574,22 @@ bool ContentEncoding::Write(IMkvWriter* writer) const {
encoding_size))
return false;
if (!WriteEbmlElement(writer, libwebm::kMkvContentEncodingOrder,
- encoding_order_))
+ static_cast<uint64>(encoding_order_)))
return false;
if (!WriteEbmlElement(writer, libwebm::kMkvContentEncodingScope,
- encoding_scope_))
+ static_cast<uint64>(encoding_scope_)))
return false;
if (!WriteEbmlElement(writer, libwebm::kMkvContentEncodingType,
- encoding_type_))
+ static_cast<uint64>(encoding_type_)))
return false;
if (!WriteEbmlMasterElement(writer, libwebm::kMkvContentEncryption,
encryption_size))
return false;
- if (!WriteEbmlElement(writer, libwebm::kMkvContentEncAlgo, enc_algo_))
+ if (!WriteEbmlElement(writer, libwebm::kMkvContentEncAlgo,
+ static_cast<uint64>(enc_algo_))) {
return false;
+ }
if (!WriteEbmlElement(writer, libwebm::kMkvContentEncKeyID, enc_key_id_,
enc_key_id_length_))
return false;
@@ -571,12 +618,12 @@ uint64_t ContentEncoding::EncodingSize(uint64_t compresion_size,
EbmlMasterElementSize(libwebm::kMkvContentEncryption, encryption_size) +
encryption_size;
}
- encoding_size +=
- EbmlElementSize(libwebm::kMkvContentEncodingType, encoding_type_);
- encoding_size +=
- EbmlElementSize(libwebm::kMkvContentEncodingScope, encoding_scope_);
- encoding_size +=
- EbmlElementSize(libwebm::kMkvContentEncodingOrder, encoding_order_);
+ encoding_size += EbmlElementSize(libwebm::kMkvContentEncodingType,
+ static_cast<uint64>(encoding_type_));
+ encoding_size += EbmlElementSize(libwebm::kMkvContentEncodingScope,
+ static_cast<uint64>(encoding_scope_));
+ encoding_size += EbmlElementSize(libwebm::kMkvContentEncodingOrder,
+ static_cast<uint64>(encoding_order_));
return encoding_size;
}
@@ -586,7 +633,8 @@ uint64_t ContentEncoding::EncryptionSize() const {
uint64_t encryption_size = EbmlElementSize(libwebm::kMkvContentEncKeyID,
enc_key_id_, enc_key_id_length_);
- encryption_size += EbmlElementSize(libwebm::kMkvContentEncAlgo, enc_algo_);
+ encryption_size += EbmlElementSize(libwebm::kMkvContentEncAlgo,
+ static_cast<uint64>(enc_algo_));
return encryption_size + aes_size;
}
@@ -664,9 +712,10 @@ ContentEncoding* Track::GetContentEncodingByIndex(uint32_t index) const {
}
uint64_t Track::PayloadSize() const {
- uint64_t size = EbmlElementSize(libwebm::kMkvTrackNumber, number_);
- size += EbmlElementSize(libwebm::kMkvTrackUID, uid_);
- size += EbmlElementSize(libwebm::kMkvTrackType, type_);
+ uint64_t size =
+ EbmlElementSize(libwebm::kMkvTrackNumber, static_cast<uint64>(number_));
+ size += EbmlElementSize(libwebm::kMkvTrackUID, static_cast<uint64>(uid_));
+ size += EbmlElementSize(libwebm::kMkvTrackType, static_cast<uint64>(type_));
if (codec_id_)
size += EbmlElementSize(libwebm::kMkvCodecID, codec_id_);
if (codec_private_)
@@ -676,15 +725,22 @@ uint64_t Track::PayloadSize() const {
size += EbmlElementSize(libwebm::kMkvLanguage, language_);
if (name_)
size += EbmlElementSize(libwebm::kMkvName, name_);
- if (max_block_additional_id_)
+ if (max_block_additional_id_) {
size += EbmlElementSize(libwebm::kMkvMaxBlockAdditionID,
- max_block_additional_id_);
- if (codec_delay_)
- size += EbmlElementSize(libwebm::kMkvCodecDelay, codec_delay_);
- if (seek_pre_roll_)
- size += EbmlElementSize(libwebm::kMkvSeekPreRoll, seek_pre_roll_);
- if (default_duration_)
- size += EbmlElementSize(libwebm::kMkvDefaultDuration, default_duration_);
+ static_cast<uint64>(max_block_additional_id_));
+ }
+ if (codec_delay_) {
+ size += EbmlElementSize(libwebm::kMkvCodecDelay,
+ static_cast<uint64>(codec_delay_));
+ }
+ if (seek_pre_roll_) {
+ size += EbmlElementSize(libwebm::kMkvSeekPreRoll,
+ static_cast<uint64>(seek_pre_roll_));
+ }
+ if (default_duration_) {
+ size += EbmlElementSize(libwebm::kMkvDefaultDuration,
+ static_cast<uint64>(default_duration_));
+ }
if (content_encoding_entries_size_ > 0) {
uint64_t content_encodings_size = 0;
@@ -722,55 +778,64 @@ bool Track::Write(IMkvWriter* writer) const {
if (!WriteEbmlMasterElement(writer, libwebm::kMkvTrackEntry, payload_size))
return false;
- uint64_t size = EbmlElementSize(libwebm::kMkvTrackNumber, number_);
- size += EbmlElementSize(libwebm::kMkvTrackUID, uid_);
- size += EbmlElementSize(libwebm::kMkvTrackType, type_);
+ uint64_t size =
+ EbmlElementSize(libwebm::kMkvTrackNumber, static_cast<uint64>(number_));
+ size += EbmlElementSize(libwebm::kMkvTrackUID, static_cast<uint64>(uid_));
+ size += EbmlElementSize(libwebm::kMkvTrackType, static_cast<uint64>(type_));
if (codec_id_)
size += EbmlElementSize(libwebm::kMkvCodecID, codec_id_);
if (codec_private_)
size += EbmlElementSize(libwebm::kMkvCodecPrivate, codec_private_,
- codec_private_length_);
+ static_cast<uint64>(codec_private_length_));
if (language_)
size += EbmlElementSize(libwebm::kMkvLanguage, language_);
if (name_)
size += EbmlElementSize(libwebm::kMkvName, name_);
if (max_block_additional_id_)
size += EbmlElementSize(libwebm::kMkvMaxBlockAdditionID,
- max_block_additional_id_);
+ static_cast<uint64>(max_block_additional_id_));
if (codec_delay_)
- size += EbmlElementSize(libwebm::kMkvCodecDelay, codec_delay_);
+ size += EbmlElementSize(libwebm::kMkvCodecDelay,
+ static_cast<uint64>(codec_delay_));
if (seek_pre_roll_)
- size += EbmlElementSize(libwebm::kMkvSeekPreRoll, seek_pre_roll_);
+ size += EbmlElementSize(libwebm::kMkvSeekPreRoll,
+ static_cast<uint64>(seek_pre_roll_));
if (default_duration_)
- size += EbmlElementSize(libwebm::kMkvDefaultDuration, default_duration_);
+ size += EbmlElementSize(libwebm::kMkvDefaultDuration,
+ static_cast<uint64>(default_duration_));
const int64_t payload_position = writer->Position();
if (payload_position < 0)
return false;
- if (!WriteEbmlElement(writer, libwebm::kMkvTrackNumber, number_))
+ if (!WriteEbmlElement(writer, libwebm::kMkvTrackNumber,
+ static_cast<uint64>(number_)))
return false;
- if (!WriteEbmlElement(writer, libwebm::kMkvTrackUID, uid_))
+ if (!WriteEbmlElement(writer, libwebm::kMkvTrackUID,
+ static_cast<uint64>(uid_)))
return false;
- if (!WriteEbmlElement(writer, libwebm::kMkvTrackType, type_))
+ if (!WriteEbmlElement(writer, libwebm::kMkvTrackType,
+ static_cast<uint64>(type_)))
return false;
if (max_block_additional_id_) {
if (!WriteEbmlElement(writer, libwebm::kMkvMaxBlockAdditionID,
- max_block_additional_id_)) {
+ static_cast<uint64>(max_block_additional_id_))) {
return false;
}
}
if (codec_delay_) {
- if (!WriteEbmlElement(writer, libwebm::kMkvCodecDelay, codec_delay_))
+ if (!WriteEbmlElement(writer, libwebm::kMkvCodecDelay,
+ static_cast<uint64>(codec_delay_)))
return false;
}
if (seek_pre_roll_) {
- if (!WriteEbmlElement(writer, libwebm::kMkvSeekPreRoll, seek_pre_roll_))
+ if (!WriteEbmlElement(writer, libwebm::kMkvSeekPreRoll,
+ static_cast<uint64>(seek_pre_roll_)))
return false;
}
if (default_duration_) {
if (!WriteEbmlElement(writer, libwebm::kMkvDefaultDuration,
- default_duration_))
+ static_cast<uint64>(default_duration_)))
return false;
}
if (codec_id_) {
@@ -779,7 +844,7 @@ bool Track::Write(IMkvWriter* writer) const {
}
if (codec_private_) {
if (!WriteEbmlElement(writer, libwebm::kMkvCodecPrivate, codec_private_,
- codec_private_length_))
+ static_cast<uint64>(codec_private_length_)))
return false;
}
if (language_) {
@@ -890,14 +955,23 @@ void Track::set_name(const char* name) {
//
// Colour and its child elements
-uint64_t PrimaryChromaticity::PrimaryChromaticityPayloadSize(
+uint64_t PrimaryChromaticity::PrimaryChromaticitySize(
libwebm::MkvId x_id, libwebm::MkvId y_id) const {
- return EbmlElementSize(x_id, x) + EbmlElementSize(y_id, y);
+ return EbmlElementSize(x_id, x_) + EbmlElementSize(y_id, y_);
}
bool PrimaryChromaticity::Write(IMkvWriter* writer, libwebm::MkvId x_id,
libwebm::MkvId y_id) const {
- return WriteEbmlElement(writer, x_id, x) && WriteEbmlElement(writer, y_id, y);
+ if (!Valid()) {
+ return false;
+ }
+ return WriteEbmlElement(writer, x_id, x_) &&
+ WriteEbmlElement(writer, y_id, y_);
+}
+
+bool PrimaryChromaticity::Valid() const {
+ return (x_ >= kChromaticityMin && x_ <= kChromaticityMax &&
+ y_ >= kChromaticityMin && y_ <= kChromaticityMax);
}
uint64_t MasteringMetadata::MasteringMetadataSize() const {
@@ -909,6 +983,31 @@ uint64_t MasteringMetadata::MasteringMetadataSize() const {
return size;
}
+bool MasteringMetadata::Valid() const {
+ if (luminance_min_ != kValueNotPresent) {
+ if (luminance_min_ < kMinLuminance || luminance_min_ > kMinLuminanceMax ||
+ luminance_min_ > luminance_max_) {
+ return false;
+ }
+ }
+ if (luminance_max_ != kValueNotPresent) {
+ if (luminance_max_ < kMinLuminance || luminance_max_ > kMaxLuminanceMax ||
+ luminance_max_ < luminance_min_) {
+ return false;
+ }
+ }
+ if (r_ && !r_->Valid())
+ return false;
+ if (g_ && !g_->Valid())
+ return false;
+ if (b_ && !b_->Valid())
+ return false;
+ if (white_point_ && !white_point_->Valid())
+ return false;
+
+ return true;
+}
+
bool MasteringMetadata::Write(IMkvWriter* writer) const {
const uint64_t size = PayloadSize();
@@ -918,12 +1017,12 @@ bool MasteringMetadata::Write(IMkvWriter* writer) const {
if (!WriteEbmlMasterElement(writer, libwebm::kMkvMasteringMetadata, size))
return false;
- if (luminance_max != kValueNotPresent &&
- !WriteEbmlElement(writer, libwebm::kMkvLuminanceMax, luminance_max)) {
+ if (luminance_max_ != kValueNotPresent &&
+ !WriteEbmlElement(writer, libwebm::kMkvLuminanceMax, luminance_max_)) {
return false;
}
- if (luminance_min != kValueNotPresent &&
- !WriteEbmlElement(writer, libwebm::kMkvLuminanceMin, luminance_min)) {
+ if (luminance_min_ != kValueNotPresent &&
+ !WriteEbmlElement(writer, libwebm::kMkvLuminanceMin, luminance_min_)) {
return false;
}
if (r_ &&
@@ -984,25 +1083,25 @@ bool MasteringMetadata::SetChromaticity(
uint64_t MasteringMetadata::PayloadSize() const {
uint64_t size = 0;
- if (luminance_max != kValueNotPresent)
- size += EbmlElementSize(libwebm::kMkvLuminanceMax, luminance_max);
- if (luminance_min != kValueNotPresent)
- size += EbmlElementSize(libwebm::kMkvLuminanceMin, luminance_min);
+ if (luminance_max_ != kValueNotPresent)
+ size += EbmlElementSize(libwebm::kMkvLuminanceMax, luminance_max_);
+ if (luminance_min_ != kValueNotPresent)
+ size += EbmlElementSize(libwebm::kMkvLuminanceMin, luminance_min_);
if (r_) {
- size += r_->PrimaryChromaticityPayloadSize(
- libwebm::kMkvPrimaryRChromaticityX, libwebm::kMkvPrimaryRChromaticityY);
+ size += r_->PrimaryChromaticitySize(libwebm::kMkvPrimaryRChromaticityX,
+ libwebm::kMkvPrimaryRChromaticityY);
}
if (g_) {
- size += g_->PrimaryChromaticityPayloadSize(
- libwebm::kMkvPrimaryGChromaticityX, libwebm::kMkvPrimaryGChromaticityY);
+ size += g_->PrimaryChromaticitySize(libwebm::kMkvPrimaryGChromaticityX,
+ libwebm::kMkvPrimaryGChromaticityY);
}
if (b_) {
- size += b_->PrimaryChromaticityPayloadSize(
- libwebm::kMkvPrimaryBChromaticityX, libwebm::kMkvPrimaryBChromaticityY);
+ size += b_->PrimaryChromaticitySize(libwebm::kMkvPrimaryBChromaticityX,
+ libwebm::kMkvPrimaryBChromaticityY);
}
if (white_point_) {
- size += white_point_->PrimaryChromaticityPayloadSize(
+ size += white_point_->PrimaryChromaticitySize(
libwebm::kMkvWhitePointChromaticityX,
libwebm::kMkvWhitePointChromaticityY);
}
@@ -1019,6 +1118,33 @@ uint64_t Colour::ColourSize() const {
return size;
}
+bool Colour::Valid() const {
+ if (mastering_metadata_ && !mastering_metadata_->Valid())
+ return false;
+ if (matrix_coefficients_ != kValueNotPresent &&
+ !IsMatrixCoefficientsValueValid(matrix_coefficients_)) {
+ return false;
+ }
+ if (chroma_siting_horz_ != kValueNotPresent &&
+ !IsChromaSitingHorzValueValid(chroma_siting_horz_)) {
+ return false;
+ }
+ if (chroma_siting_vert_ != kValueNotPresent &&
+ !IsChromaSitingVertValueValid(chroma_siting_vert_)) {
+ return false;
+ }
+ if (range_ != kValueNotPresent && !IsColourRangeValueValid(range_))
+ return false;
+ if (transfer_characteristics_ != kValueNotPresent &&
+ !IsTransferCharacteristicsValueValid(transfer_characteristics_)) {
+ return false;
+ }
+ if (primaries_ != kValueNotPresent && !IsPrimariesValueValid(primaries_))
+ return false;
+
+ return true;
+}
+
bool Colour::Write(IMkvWriter* writer) const {
const uint64_t size = PayloadSize();
@@ -1026,69 +1152,77 @@ bool Colour::Write(IMkvWriter* writer) const {
if (size == 0)
return true;
+ // Don't write an invalid element.
+ if (!Valid())
+ return false;
+
if (!WriteEbmlMasterElement(writer, libwebm::kMkvColour, size))
return false;
- if (matrix_coefficients != kValueNotPresent &&
+ if (matrix_coefficients_ != kValueNotPresent &&
!WriteEbmlElement(writer, libwebm::kMkvMatrixCoefficients,
- matrix_coefficients)) {
+ static_cast<uint64>(matrix_coefficients_))) {
return false;
}
- if (bits_per_channel != kValueNotPresent &&
+ if (bits_per_channel_ != kValueNotPresent &&
!WriteEbmlElement(writer, libwebm::kMkvBitsPerChannel,
- bits_per_channel)) {
+ static_cast<uint64>(bits_per_channel_))) {
return false;
}
- if (chroma_subsampling_horz != kValueNotPresent &&
+ if (chroma_subsampling_horz_ != kValueNotPresent &&
!WriteEbmlElement(writer, libwebm::kMkvChromaSubsamplingHorz,
- chroma_subsampling_horz)) {
+ static_cast<uint64>(chroma_subsampling_horz_))) {
return false;
}
- if (chroma_subsampling_vert != kValueNotPresent &&
+ if (chroma_subsampling_vert_ != kValueNotPresent &&
!WriteEbmlElement(writer, libwebm::kMkvChromaSubsamplingVert,
- chroma_subsampling_vert)) {
+ static_cast<uint64>(chroma_subsampling_vert_))) {
return false;
}
- if (cb_subsampling_horz != kValueNotPresent &&
+ if (cb_subsampling_horz_ != kValueNotPresent &&
!WriteEbmlElement(writer, libwebm::kMkvCbSubsamplingHorz,
- cb_subsampling_horz)) {
+ static_cast<uint64>(cb_subsampling_horz_))) {
return false;
}
- if (cb_subsampling_vert != kValueNotPresent &&
+ if (cb_subsampling_vert_ != kValueNotPresent &&
!WriteEbmlElement(writer, libwebm::kMkvCbSubsamplingVert,
- cb_subsampling_vert)) {
+ static_cast<uint64>(cb_subsampling_vert_))) {
return false;
}
- if (chroma_siting_horz != kValueNotPresent &&
+ if (chroma_siting_horz_ != kValueNotPresent &&
!WriteEbmlElement(writer, libwebm::kMkvChromaSitingHorz,
- chroma_siting_horz)) {
+ static_cast<uint64>(chroma_siting_horz_))) {
return false;
}
- if (chroma_siting_vert != kValueNotPresent &&
+ if (chroma_siting_vert_ != kValueNotPresent &&
!WriteEbmlElement(writer, libwebm::kMkvChromaSitingVert,
- chroma_siting_vert)) {
+ static_cast<uint64>(chroma_siting_vert_))) {
return false;
}
- if (range != kValueNotPresent &&
- !WriteEbmlElement(writer, libwebm::kMkvRange, range)) {
+ if (range_ != kValueNotPresent &&
+ !WriteEbmlElement(writer, libwebm::kMkvRange,
+ static_cast<uint64>(range_))) {
return false;
}
- if (transfer_characteristics != kValueNotPresent &&
+ if (transfer_characteristics_ != kValueNotPresent &&
!WriteEbmlElement(writer, libwebm::kMkvTransferCharacteristics,
- transfer_characteristics)) {
+ static_cast<uint64>(transfer_characteristics_))) {
return false;
}
- if (primaries != kValueNotPresent &&
- !WriteEbmlElement(writer, libwebm::kMkvPrimaries, primaries)) {
+ if (primaries_ != kValueNotPresent &&
+ !WriteEbmlElement(writer, libwebm::kMkvPrimaries,
+ static_cast<uint64>(primaries_))) {
return false;
}
- if (max_cll != kValueNotPresent &&
- !WriteEbmlElement(writer, libwebm::kMkvMaxCLL, max_cll)) {
+ if (max_cll_ != kValueNotPresent &&
+ !WriteEbmlElement(writer, libwebm::kMkvMaxCLL,
+ static_cast<uint64>(max_cll_))) {
return false;
}
- if (max_fall != kValueNotPresent &&
- !WriteEbmlElement(writer, libwebm::kMkvMaxFALL, max_fall)) {
+ if (max_fall_ != kValueNotPresent &&
+ !WriteEbmlElement(writer, libwebm::kMkvMaxFALL,
+ static_cast<uint64>(max_fall_))) {
return false;
}
@@ -1103,8 +1237,8 @@ bool Colour::SetMasteringMetadata(const MasteringMetadata& mastering_metadata) {
if (!mm_ptr.get())
return false;
- mm_ptr->luminance_max = mastering_metadata.luminance_max;
- mm_ptr->luminance_min = mastering_metadata.luminance_min;
+ mm_ptr->set_luminance_max(mastering_metadata.luminance_max());
+ mm_ptr->set_luminance_min(mastering_metadata.luminance_min());
if (!mm_ptr->SetChromaticity(mastering_metadata.r(), mastering_metadata.g(),
mastering_metadata.b(),
@@ -1120,38 +1254,56 @@ bool Colour::SetMasteringMetadata(const MasteringMetadata& mastering_metadata) {
uint64_t Colour::PayloadSize() const {
uint64_t size = 0;
- if (matrix_coefficients != kValueNotPresent)
- size +=
- EbmlElementSize(libwebm::kMkvMatrixCoefficients, matrix_coefficients);
- if (bits_per_channel != kValueNotPresent)
- size += EbmlElementSize(libwebm::kMkvBitsPerChannel, bits_per_channel);
- if (chroma_subsampling_horz != kValueNotPresent)
+ if (matrix_coefficients_ != kValueNotPresent) {
+ size += EbmlElementSize(libwebm::kMkvMatrixCoefficients,
+ static_cast<uint64>(matrix_coefficients_));
+ }
+ if (bits_per_channel_ != kValueNotPresent) {
+ size += EbmlElementSize(libwebm::kMkvBitsPerChannel,
+ static_cast<uint64>(bits_per_channel_));
+ }
+ if (chroma_subsampling_horz_ != kValueNotPresent) {
size += EbmlElementSize(libwebm::kMkvChromaSubsamplingHorz,
- chroma_subsampling_horz);
- if (chroma_subsampling_vert != kValueNotPresent)
+ static_cast<uint64>(chroma_subsampling_horz_));
+ }
+ if (chroma_subsampling_vert_ != kValueNotPresent) {
size += EbmlElementSize(libwebm::kMkvChromaSubsamplingVert,
- chroma_subsampling_vert);
- if (cb_subsampling_horz != kValueNotPresent)
- size +=
- EbmlElementSize(libwebm::kMkvCbSubsamplingHorz, cb_subsampling_horz);
- if (cb_subsampling_vert != kValueNotPresent)
- size +=
- EbmlElementSize(libwebm::kMkvCbSubsamplingVert, cb_subsampling_vert);
- if (chroma_siting_horz != kValueNotPresent)
- size += EbmlElementSize(libwebm::kMkvChromaSitingHorz, chroma_siting_horz);
- if (chroma_siting_vert != kValueNotPresent)
- size += EbmlElementSize(libwebm::kMkvChromaSitingVert, chroma_siting_vert);
- if (range != kValueNotPresent)
- size += EbmlElementSize(libwebm::kMkvRange, range);
- if (transfer_characteristics != kValueNotPresent)
+ static_cast<uint64>(chroma_subsampling_vert_));
+ }
+ if (cb_subsampling_horz_ != kValueNotPresent) {
+ size += EbmlElementSize(libwebm::kMkvCbSubsamplingHorz,
+ static_cast<uint64>(cb_subsampling_horz_));
+ }
+ if (cb_subsampling_vert_ != kValueNotPresent) {
+ size += EbmlElementSize(libwebm::kMkvCbSubsamplingVert,
+ static_cast<uint64>(cb_subsampling_vert_));
+ }
+ if (chroma_siting_horz_ != kValueNotPresent) {
+ size += EbmlElementSize(libwebm::kMkvChromaSitingHorz,
+ static_cast<uint64>(chroma_siting_horz_));
+ }
+ if (chroma_siting_vert_ != kValueNotPresent) {
+ size += EbmlElementSize(libwebm::kMkvChromaSitingVert,
+ static_cast<uint64>(chroma_siting_vert_));
+ }
+ if (range_ != kValueNotPresent) {
+ size += EbmlElementSize(libwebm::kMkvRange, static_cast<uint64>(range_));
+ }
+ if (transfer_characteristics_ != kValueNotPresent) {
size += EbmlElementSize(libwebm::kMkvTransferCharacteristics,
- transfer_characteristics);
- if (primaries != kValueNotPresent)
- size += EbmlElementSize(libwebm::kMkvPrimaries, primaries);
- if (max_cll != kValueNotPresent)
- size += EbmlElementSize(libwebm::kMkvMaxCLL, max_cll);
- if (max_fall != kValueNotPresent)
- size += EbmlElementSize(libwebm::kMkvMaxFALL, max_fall);
+ static_cast<uint64>(transfer_characteristics_));
+ }
+ if (primaries_ != kValueNotPresent) {
+ size += EbmlElementSize(libwebm::kMkvPrimaries,
+ static_cast<uint64>(primaries_));
+ }
+ if (max_cll_ != kValueNotPresent) {
+ size += EbmlElementSize(libwebm::kMkvMaxCLL, static_cast<uint64>(max_cll_));
+ }
+ if (max_fall_ != kValueNotPresent) {
+ size +=
+ EbmlElementSize(libwebm::kMkvMaxFALL, static_cast<uint64>(max_fall_));
+ }
if (mastering_metadata_)
size += mastering_metadata_->MasteringMetadataSize();
@@ -1161,12 +1313,103 @@ uint64_t Colour::PayloadSize() const {
///////////////////////////////////////////////////////////////
//
+// Projection element
+
+uint64_t Projection::ProjectionSize() const {
+ uint64_t size = PayloadSize();
+
+ if (size > 0)
+ size += EbmlMasterElementSize(libwebm::kMkvProjection, size);
+
+ return size;
+}
+
+bool Projection::Write(IMkvWriter* writer) const {
+ const uint64_t size = PayloadSize();
+
+ // Don't write an empty element.
+ if (size == 0)
+ return true;
+
+ if (!WriteEbmlMasterElement(writer, libwebm::kMkvProjection, size))
+ return false;
+
+ if (!WriteEbmlElement(writer, libwebm::kMkvProjectionType,
+ static_cast<uint64>(type_))) {
+ return false;
+ }
+
+ if (private_data_length_ > 0 && private_data_ != NULL &&
+ !WriteEbmlElement(writer, libwebm::kMkvProjectionPrivate, private_data_,
+ private_data_length_)) {
+ return false;
+ }
+
+ if (!WriteEbmlElement(writer, libwebm::kMkvProjectionPoseYaw, pose_yaw_))
+ return false;
+
+ if (!WriteEbmlElement(writer, libwebm::kMkvProjectionPosePitch,
+ pose_pitch_)) {
+ return false;
+ }
+
+ if (!WriteEbmlElement(writer, libwebm::kMkvProjectionPoseRoll, pose_roll_)) {
+ return false;
+ }
+
+ return true;
+}
+
+bool Projection::SetProjectionPrivate(const uint8_t* data,
+ uint64_t data_length) {
+ if (data == NULL || data_length == 0) {
+ return false;
+ }
+
+ if (data_length != static_cast<size_t>(data_length)) {
+ return false;
+ }
+
+ uint8_t* new_private_data =
+ new (std::nothrow) uint8_t[static_cast<size_t>(data_length)];
+ if (new_private_data == NULL) {
+ return false;
+ }
+
+ delete[] private_data_;
+ private_data_ = new_private_data;
+ private_data_length_ = data_length;
+ memcpy(private_data_, data, static_cast<size_t>(data_length));
+
+ return true;
+}
+
+uint64_t Projection::PayloadSize() const {
+ uint64_t size =
+ EbmlElementSize(libwebm::kMkvProjection, static_cast<uint64>(type_));
+
+ if (private_data_length_ > 0 && private_data_ != NULL) {
+ size += EbmlElementSize(libwebm::kMkvProjectionPrivate, private_data_,
+ private_data_length_);
+ }
+
+ size += EbmlElementSize(libwebm::kMkvProjectionPoseYaw, pose_yaw_);
+ size += EbmlElementSize(libwebm::kMkvProjectionPosePitch, pose_pitch_);
+ size += EbmlElementSize(libwebm::kMkvProjectionPoseRoll, pose_roll_);
+
+ return size;
+}
+
+///////////////////////////////////////////////////////////////
+//
// VideoTrack Class
VideoTrack::VideoTrack(unsigned int* seed)
: Track(seed),
display_height_(0),
display_width_(0),
+ pixel_height_(0),
+ pixel_width_(0),
crop_left_(0),
crop_right_(0),
crop_top_(0),
@@ -1176,9 +1419,13 @@ VideoTrack::VideoTrack(unsigned int* seed)
stereo_mode_(0),
alpha_mode_(0),
width_(0),
- colour_(NULL) {}
+ colour_(NULL),
+ projection_(NULL) {}
-VideoTrack::~VideoTrack() { delete colour_; }
+VideoTrack::~VideoTrack() {
+ delete colour_;
+ delete projection_;
+}
bool VideoTrack::SetStereoMode(uint64_t stereo_mode) {
if (stereo_mode != kMono && stereo_mode != kSideBySideLeftIsFirst &&
@@ -1221,40 +1468,52 @@ bool VideoTrack::Write(IMkvWriter* writer) const {
if (payload_position < 0)
return false;
- if (!WriteEbmlElement(writer, libwebm::kMkvPixelWidth, width_))
+ if (!WriteEbmlElement(
+ writer, libwebm::kMkvPixelWidth,
+ static_cast<uint64>((pixel_width_ > 0) ? pixel_width_ : width_)))
return false;
- if (!WriteEbmlElement(writer, libwebm::kMkvPixelHeight, height_))
+ if (!WriteEbmlElement(
+ writer, libwebm::kMkvPixelHeight,
+ static_cast<uint64>((pixel_height_ > 0) ? pixel_height_ : height_)))
return false;
if (display_width_ > 0) {
- if (!WriteEbmlElement(writer, libwebm::kMkvDisplayWidth, display_width_))
+ if (!WriteEbmlElement(writer, libwebm::kMkvDisplayWidth,
+ static_cast<uint64>(display_width_)))
return false;
}
if (display_height_ > 0) {
- if (!WriteEbmlElement(writer, libwebm::kMkvDisplayHeight, display_height_))
+ if (!WriteEbmlElement(writer, libwebm::kMkvDisplayHeight,
+ static_cast<uint64>(display_height_)))
return false;
}
if (crop_left_ > 0) {
- if (!WriteEbmlElement(writer, libwebm::kMkvPixelCropLeft, crop_left_))
+ if (!WriteEbmlElement(writer, libwebm::kMkvPixelCropLeft,
+ static_cast<uint64>(crop_left_)))
return false;
}
if (crop_right_ > 0) {
- if (!WriteEbmlElement(writer, libwebm::kMkvPixelCropRight, crop_right_))
+ if (!WriteEbmlElement(writer, libwebm::kMkvPixelCropRight,
+ static_cast<uint64>(crop_right_)))
return false;
}
if (crop_top_ > 0) {
- if (!WriteEbmlElement(writer, libwebm::kMkvPixelCropTop, crop_top_))
+ if (!WriteEbmlElement(writer, libwebm::kMkvPixelCropTop,
+ static_cast<uint64>(crop_top_)))
return false;
}
if (crop_bottom_ > 0) {
- if (!WriteEbmlElement(writer, libwebm::kMkvPixelCropBottom, crop_bottom_))
+ if (!WriteEbmlElement(writer, libwebm::kMkvPixelCropBottom,
+ static_cast<uint64>(crop_bottom_)))
return false;
}
if (stereo_mode_ > kMono) {
- if (!WriteEbmlElement(writer, libwebm::kMkvStereoMode, stereo_mode_))
+ if (!WriteEbmlElement(writer, libwebm::kMkvStereoMode,
+ static_cast<uint64>(stereo_mode_)))
return false;
}
if (alpha_mode_ > kNoAlpha) {
- if (!WriteEbmlElement(writer, libwebm::kMkvAlphaMode, alpha_mode_))
+ if (!WriteEbmlElement(writer, libwebm::kMkvAlphaMode,
+ static_cast<uint64>(alpha_mode_)))
return false;
}
if (frame_rate_ > 0.0) {
@@ -1267,6 +1526,10 @@ bool VideoTrack::Write(IMkvWriter* writer) const {
if (!colour_->Write(writer))
return false;
}
+ if (projection_) {
+ if (!projection_->Write(writer))
+ return false;
+ }
const int64_t stop_position = writer->Position();
if (stop_position < 0 ||
@@ -1287,47 +1550,83 @@ bool VideoTrack::SetColour(const Colour& colour) {
return false;
}
- colour_ptr->matrix_coefficients = colour.matrix_coefficients;
- colour_ptr->bits_per_channel = colour.bits_per_channel;
- colour_ptr->chroma_subsampling_horz = colour.chroma_subsampling_horz;
- colour_ptr->chroma_subsampling_vert = colour.chroma_subsampling_vert;
- colour_ptr->cb_subsampling_horz = colour.cb_subsampling_horz;
- colour_ptr->cb_subsampling_vert = colour.cb_subsampling_vert;
- colour_ptr->chroma_siting_horz = colour.chroma_siting_horz;
- colour_ptr->chroma_siting_vert = colour.chroma_siting_vert;
- colour_ptr->range = colour.range;
- colour_ptr->transfer_characteristics = colour.transfer_characteristics;
- colour_ptr->primaries = colour.primaries;
- colour_ptr->max_cll = colour.max_cll;
- colour_ptr->max_fall = colour.max_fall;
+ colour_ptr->set_matrix_coefficients(colour.matrix_coefficients());
+ colour_ptr->set_bits_per_channel(colour.bits_per_channel());
+ colour_ptr->set_chroma_subsampling_horz(colour.chroma_subsampling_horz());
+ colour_ptr->set_chroma_subsampling_vert(colour.chroma_subsampling_vert());
+ colour_ptr->set_cb_subsampling_horz(colour.cb_subsampling_horz());
+ colour_ptr->set_cb_subsampling_vert(colour.cb_subsampling_vert());
+ colour_ptr->set_chroma_siting_horz(colour.chroma_siting_horz());
+ colour_ptr->set_chroma_siting_vert(colour.chroma_siting_vert());
+ colour_ptr->set_range(colour.range());
+ colour_ptr->set_transfer_characteristics(colour.transfer_characteristics());
+ colour_ptr->set_primaries(colour.primaries());
+ colour_ptr->set_max_cll(colour.max_cll());
+ colour_ptr->set_max_fall(colour.max_fall());
+ delete colour_;
colour_ = colour_ptr.release();
return true;
}
+bool VideoTrack::SetProjection(const Projection& projection) {
+ std::auto_ptr<Projection> projection_ptr(new Projection());
+ if (!projection_ptr.get())
+ return false;
+
+ if (projection.private_data()) {
+ if (!projection_ptr->SetProjectionPrivate(
+ projection.private_data(), projection.private_data_length())) {
+ return false;
+ }
+ }
+
+ projection_ptr->set_type(projection.type());
+ projection_ptr->set_pose_yaw(projection.pose_yaw());
+ projection_ptr->set_pose_pitch(projection.pose_pitch());
+ projection_ptr->set_pose_roll(projection.pose_roll());
+ delete projection_;
+ projection_ = projection_ptr.release();
+ return true;
+}
+
uint64_t VideoTrack::VideoPayloadSize() const {
- uint64_t size = EbmlElementSize(libwebm::kMkvPixelWidth, width_);
- size += EbmlElementSize(libwebm::kMkvPixelHeight, height_);
+ uint64_t size = EbmlElementSize(
+ libwebm::kMkvPixelWidth,
+ static_cast<uint64>((pixel_width_ > 0) ? pixel_width_ : width_));
+ size += EbmlElementSize(
+ libwebm::kMkvPixelHeight,
+ static_cast<uint64>((pixel_height_ > 0) ? pixel_height_ : height_));
if (display_width_ > 0)
- size += EbmlElementSize(libwebm::kMkvDisplayWidth, display_width_);
+ size += EbmlElementSize(libwebm::kMkvDisplayWidth,
+ static_cast<uint64>(display_width_));
if (display_height_ > 0)
- size += EbmlElementSize(libwebm::kMkvDisplayHeight, display_height_);
+ size += EbmlElementSize(libwebm::kMkvDisplayHeight,
+ static_cast<uint64>(display_height_));
if (crop_left_ > 0)
- size += EbmlElementSize(libwebm::kMkvPixelCropLeft, crop_left_);
+ size += EbmlElementSize(libwebm::kMkvPixelCropLeft,
+ static_cast<uint64>(crop_left_));
if (crop_right_ > 0)
- size += EbmlElementSize(libwebm::kMkvPixelCropRight, crop_right_);
+ size += EbmlElementSize(libwebm::kMkvPixelCropRight,
+ static_cast<uint64>(crop_right_));
if (crop_top_ > 0)
- size += EbmlElementSize(libwebm::kMkvPixelCropTop, crop_top_);
+ size += EbmlElementSize(libwebm::kMkvPixelCropTop,
+ static_cast<uint64>(crop_top_));
if (crop_bottom_ > 0)
- size += EbmlElementSize(libwebm::kMkvPixelCropBottom, crop_bottom_);
+ size += EbmlElementSize(libwebm::kMkvPixelCropBottom,
+ static_cast<uint64>(crop_bottom_));
if (stereo_mode_ > kMono)
- size += EbmlElementSize(libwebm::kMkvStereoMode, stereo_mode_);
+ size += EbmlElementSize(libwebm::kMkvStereoMode,
+ static_cast<uint64>(stereo_mode_));
if (alpha_mode_ > kNoAlpha)
- size += EbmlElementSize(libwebm::kMkvAlphaMode, alpha_mode_);
+ size += EbmlElementSize(libwebm::kMkvAlphaMode,
+ static_cast<uint64>(alpha_mode_));
if (frame_rate_ > 0.0)
size += EbmlElementSize(libwebm::kMkvFrameRate,
static_cast<float>(frame_rate_));
if (colour_)
size += colour_->ColourSize();
+ if (projection_)
+ size += projection_->ProjectionSize();
return size;
}
@@ -1346,9 +1645,11 @@ uint64_t AudioTrack::PayloadSize() const {
uint64_t size = EbmlElementSize(libwebm::kMkvSamplingFrequency,
static_cast<float>(sample_rate_));
- size += EbmlElementSize(libwebm::kMkvChannels, channels_);
+ size +=
+ EbmlElementSize(libwebm::kMkvChannels, static_cast<uint64>(channels_));
if (bit_depth_ > 0)
- size += EbmlElementSize(libwebm::kMkvBitDepth, bit_depth_);
+ size +=
+ EbmlElementSize(libwebm::kMkvBitDepth, static_cast<uint64>(bit_depth_));
size += EbmlMasterElementSize(libwebm::kMkvAudio, size);
return parent_size + size;
@@ -1361,9 +1662,11 @@ bool AudioTrack::Write(IMkvWriter* writer) const {
// Calculate AudioSettings size.
uint64_t size = EbmlElementSize(libwebm::kMkvSamplingFrequency,
static_cast<float>(sample_rate_));
- size += EbmlElementSize(libwebm::kMkvChannels, channels_);
+ size +=
+ EbmlElementSize(libwebm::kMkvChannels, static_cast<uint64>(channels_));
if (bit_depth_ > 0)
- size += EbmlElementSize(libwebm::kMkvBitDepth, bit_depth_);
+ size +=
+ EbmlElementSize(libwebm::kMkvBitDepth, static_cast<uint64>(bit_depth_));
if (!WriteEbmlMasterElement(writer, libwebm::kMkvAudio, size))
return false;
@@ -1375,10 +1678,12 @@ bool AudioTrack::Write(IMkvWriter* writer) const {
if (!WriteEbmlElement(writer, libwebm::kMkvSamplingFrequency,
static_cast<float>(sample_rate_)))
return false;
- if (!WriteEbmlElement(writer, libwebm::kMkvChannels, channels_))
+ if (!WriteEbmlElement(writer, libwebm::kMkvChannels,
+ static_cast<uint64>(channels_)))
return false;
if (bit_depth_ > 0)
- if (!WriteEbmlElement(writer, libwebm::kMkvBitDepth, bit_depth_))
+ if (!WriteEbmlElement(writer, libwebm::kMkvBitDepth,
+ static_cast<uint64>(bit_depth_)))
return false;
const int64_t stop_position = writer->Position();
@@ -1398,6 +1703,10 @@ const char Tracks::kVorbisCodecId[] = "A_VORBIS";
const char Tracks::kVp8CodecId[] = "V_VP8";
const char Tracks::kVp9CodecId[] = "V_VP9";
const char Tracks::kVp10CodecId[] = "V_VP10";
+const char Tracks::kWebVttCaptionsId[] = "D_WEBVTT/CAPTIONS";
+const char Tracks::kWebVttDescriptionsId[] = "D_WEBVTT/DESCRIPTIONS";
+const char Tracks::kWebVttMetadataId[] = "D_WEBVTT/METADATA";
+const char Tracks::kWebVttSubtitlesId[] = "D_WEBVTT/SUBTITLES";
Tracks::Tracks()
: track_entries_(NULL), track_entries_size_(0), wrote_tracks_(false) {}
@@ -1650,9 +1959,11 @@ bool Chapter::ExpandDisplaysArray() {
uint64_t Chapter::WriteAtom(IMkvWriter* writer) const {
uint64_t payload_size =
EbmlElementSize(libwebm::kMkvChapterStringUID, id_) +
- EbmlElementSize(libwebm::kMkvChapterUID, uid_) +
- EbmlElementSize(libwebm::kMkvChapterTimeStart, start_timecode_) +
- EbmlElementSize(libwebm::kMkvChapterTimeEnd, end_timecode_);
+ EbmlElementSize(libwebm::kMkvChapterUID, static_cast<uint64>(uid_)) +
+ EbmlElementSize(libwebm::kMkvChapterTimeStart,
+ static_cast<uint64>(start_timecode_)) +
+ EbmlElementSize(libwebm::kMkvChapterTimeEnd,
+ static_cast<uint64>(end_timecode_));
for (int idx = 0; idx < displays_count_; ++idx) {
const Display& d = displays_[idx];
@@ -1674,13 +1985,16 @@ uint64_t Chapter::WriteAtom(IMkvWriter* writer) const {
if (!WriteEbmlElement(writer, libwebm::kMkvChapterStringUID, id_))
return 0;
- if (!WriteEbmlElement(writer, libwebm::kMkvChapterUID, uid_))
+ if (!WriteEbmlElement(writer, libwebm::kMkvChapterUID,
+ static_cast<uint64>(uid_)))
return 0;
- if (!WriteEbmlElement(writer, libwebm::kMkvChapterTimeStart, start_timecode_))
+ if (!WriteEbmlElement(writer, libwebm::kMkvChapterTimeStart,
+ static_cast<uint64>(start_timecode_)))
return 0;
- if (!WriteEbmlElement(writer, libwebm::kMkvChapterTimeEnd, end_timecode_))
+ if (!WriteEbmlElement(writer, libwebm::kMkvChapterTimeEnd,
+ static_cast<uint64>(end_timecode_)))
return 0;
for (int idx = 0; idx < displays_count_; ++idx) {
@@ -2125,7 +2439,17 @@ Cluster::Cluster(uint64_t timecode, int64_t cues_pos, uint64_t timecode_scale,
write_last_frame_with_duration_(write_last_frame_with_duration),
writer_(NULL) {}
-Cluster::~Cluster() {}
+Cluster::~Cluster() {
+ // Delete any stored frames that are left behind. This will happen if the
+ // Cluster was not Finalized for whatever reason.
+ while (!stored_frames_.empty()) {
+ while (!stored_frames_.begin()->second.empty()) {
+ delete stored_frames_.begin()->second.front();
+ stored_frames_.begin()->second.pop_front();
+ }
+ stored_frames_.erase(stored_frames_.begin()->first);
+ }
+}
bool Cluster::Init(IMkvWriter* ptr_writer) {
if (!ptr_writer) {
@@ -2421,10 +2745,10 @@ bool SeekHead::Finalize(IMkvWriter* writer) const {
for (int32_t i = 0; i < kSeekEntryCount; ++i) {
if (seek_entry_id_[i] != 0) {
- entry_size[i] = EbmlElementSize(
- libwebm::kMkvSeekID, static_cast<uint64_t>(seek_entry_id_[i]));
- entry_size[i] +=
- EbmlElementSize(libwebm::kMkvSeekPosition, seek_entry_pos_[i]);
+ entry_size[i] = EbmlElementSize(libwebm::kMkvSeekID,
+ static_cast<uint64>(seek_entry_id_[i]));
+ entry_size[i] += EbmlElementSize(
+ libwebm::kMkvSeekPosition, static_cast<uint64>(seek_entry_pos_[i]));
payload_size +=
EbmlMasterElementSize(libwebm::kMkvSeek, entry_size[i]) +
@@ -2449,11 +2773,11 @@ bool SeekHead::Finalize(IMkvWriter* writer) const {
return false;
if (!WriteEbmlElement(writer, libwebm::kMkvSeekID,
- static_cast<uint64_t>(seek_entry_id_[i])))
+ static_cast<uint64>(seek_entry_id_[i])))
return false;
if (!WriteEbmlElement(writer, libwebm::kMkvSeekPosition,
- seek_entry_pos_[i]))
+ static_cast<uint64>(seek_entry_pos_[i])))
return false;
}
}
@@ -2522,8 +2846,10 @@ bool SeekHead::SetSeekEntry(int index, uint32_t id, uint64_t position) {
uint64_t SeekHead::MaxEntrySize() const {
const uint64_t max_entry_payload_size =
- EbmlElementSize(libwebm::kMkvSeekID, UINT64_C(0xffffffff)) +
- EbmlElementSize(libwebm::kMkvSeekPosition, UINT64_C(0xffffffffffffffff));
+ EbmlElementSize(libwebm::kMkvSeekID,
+ static_cast<uint64>(UINT64_C(0xffffffff))) +
+ EbmlElementSize(libwebm::kMkvSeekPosition,
+ static_cast<uint64>(UINT64_C(0xffffffffffffffff)));
const uint64_t max_entry_size =
EbmlMasterElementSize(libwebm::kMkvSeek, max_entry_payload_size) +
max_entry_payload_size;
@@ -2613,7 +2939,8 @@ bool SegmentInfo::Write(IMkvWriter* writer) {
if (!writer || !muxing_app_ || !writing_app_)
return false;
- uint64_t size = EbmlElementSize(libwebm::kMkvTimecodeScale, timecode_scale_);
+ uint64_t size = EbmlElementSize(libwebm::kMkvTimecodeScale,
+ static_cast<uint64>(timecode_scale_));
if (duration_ > 0.0)
size +=
EbmlElementSize(libwebm::kMkvDuration, static_cast<float>(duration_));
@@ -2629,7 +2956,8 @@ bool SegmentInfo::Write(IMkvWriter* writer) {
if (payload_position < 0)
return false;
- if (!WriteEbmlElement(writer, libwebm::kMkvTimecodeScale, timecode_scale_))
+ if (!WriteEbmlElement(writer, libwebm::kMkvTimecodeScale,
+ static_cast<uint64>(timecode_scale_)))
return false;
if (duration_ > 0.0) {
@@ -2725,10 +3053,12 @@ Segment::Segment()
output_cues_(true),
accurate_cluster_duration_(false),
fixed_size_cluster_timecode_(false),
+ estimate_file_duration_(true),
payload_pos_(0),
size_position_(0),
doc_type_version_(kDefaultDocTypeVersion),
doc_type_version_written_(0),
+ duration_(0.0),
writer_cluster_(NULL),
writer_cues_(NULL),
writer_header_(NULL) {
@@ -2833,6 +3163,10 @@ bool Segment::Init(IMkvWriter* ptr_writer) {
writer_cluster_ = ptr_writer;
writer_cues_ = ptr_writer;
writer_header_ = ptr_writer;
+ memset(&track_frames_written_, 0,
+ sizeof(track_frames_written_[0]) * kMaxTrackNumber);
+ memset(&last_track_timestamp_, 0,
+ sizeof(last_track_timestamp_[0]) * kMaxTrackNumber);
return segment_info_.Init();
}
@@ -2876,7 +3210,10 @@ bool Segment::Finalize() {
if (WriteFramesAll() < 0)
return false;
- if (cluster_list_size_ > 0) {
+ // In kLive mode, call Cluster::Finalize only if |accurate_cluster_duration_|
+ // is set. In all other modes, always call Cluster::Finalize.
+ if ((mode_ == kLive ? accurate_cluster_duration_ : true) &&
+ cluster_list_size_ > 0) {
// Update last cluster's size
Cluster* const old_cluster = cluster_list_[cluster_list_size_ - 1];
@@ -2892,9 +3229,30 @@ bool Segment::Finalize() {
chunk_count_++;
}
- const double duration =
+ double duration =
(static_cast<double>(last_timestamp_) + last_block_duration_) /
segment_info_.timecode_scale();
+ if (duration_ > 0.0) {
+ duration = duration_;
+ } else {
+ if (last_block_duration_ == 0 && estimate_file_duration_) {
+ const int num_tracks = static_cast<int>(tracks_.track_entries_size());
+ for (int i = 0; i < num_tracks; ++i) {
+ if (track_frames_written_[i] < 2)
+ continue;
+
+ // Estimate the duration for the last block of a Track.
+ const double nano_per_frame =
+ static_cast<double>(last_track_timestamp_[i]) /
+ (track_frames_written_[i] - 1);
+ const double track_duration =
+ (last_track_timestamp_[i] + nano_per_frame) /
+ segment_info_.timecode_scale();
+ if (track_duration > duration)
+ duration = track_duration;
+ }
+ }
+ }
segment_info_.set_duration(duration);
if (!segment_info_.Finalize(writer_header_))
return false;
@@ -2941,7 +3299,9 @@ bool Segment::Finalize() {
if (writer_header_->Position(0))
return false;
- if (!WriteEbmlHeader(writer_header_, doc_type_version_))
+ const char* const doc_type =
+ DocTypeIsWebm() ? kDocTypeWebm : kDocTypeMatroska;
+ if (!WriteEbmlHeader(writer_header_, doc_type_version_, doc_type))
return false;
if (writer_header_->Position() != ebml_header_size_)
return false;
@@ -3138,7 +3498,10 @@ bool Segment::AddGenericFrame(const Frame* frame) {
Frame* const new_frame = new (std::nothrow) Frame();
if (!new_frame || !new_frame->CopyFrom(*frame))
return false;
- return QueueFrame(new_frame);
+ if (!QueueFrame(new_frame))
+ return false;
+ track_frames_written_[frame->track_number() - 1]++;
+ return true;
}
if (!DoNewClusterProcessing(frame->track_number(), frame->timestamp(),
@@ -3178,10 +3541,10 @@ bool Segment::AddGenericFrame(const Frame* frame) {
last_timestamp_ = frame->timestamp();
last_track_timestamp_[frame->track_number() - 1] = frame->timestamp();
last_block_duration_ = frame->duration();
+ track_frames_written_[frame->track_number() - 1]++;
if (frame_created)
delete frame;
-
return true;
}
@@ -3292,8 +3655,9 @@ Track* Segment::GetTrackByNumber(uint64_t track_number) const {
bool Segment::WriteSegmentHeader() {
UpdateDocTypeVersion();
- // TODO(fgalligan): Support more than one segment.
- if (!WriteEbmlHeader(writer_header_, doc_type_version_))
+ const char* const doc_type =
+ DocTypeIsWebm() ? kDocTypeWebm : kDocTypeMatroska;
+ if (!WriteEbmlHeader(writer_header_, doc_type_version_, doc_type))
return false;
doc_type_version_written_ = doc_type_version_;
ebml_header_size_ = static_cast<int32_t>(writer_header_->Position());
@@ -3766,4 +4130,35 @@ bool Segment::WriteFramesLessThan(uint64_t timestamp) {
return true;
}
+bool Segment::DocTypeIsWebm() const {
+ const int kNumCodecIds = 9;
+
+ // TODO(vigneshv): Tweak .clang-format.
+ const char* kWebmCodecIds[kNumCodecIds] = {
+ Tracks::kOpusCodecId, Tracks::kVorbisCodecId,
+ Tracks::kVp8CodecId, Tracks::kVp9CodecId,
+ Tracks::kVp10CodecId, Tracks::kWebVttCaptionsId,
+ Tracks::kWebVttDescriptionsId, Tracks::kWebVttMetadataId,
+ Tracks::kWebVttSubtitlesId};
+
+ const int num_tracks = static_cast<int>(tracks_.track_entries_size());
+ for (int track_index = 0; track_index < num_tracks; ++track_index) {
+ const Track* const track = tracks_.GetTrackByIndex(track_index);
+ const std::string codec_id = track->codec_id();
+
+ bool id_is_webm = false;
+ for (int id_index = 0; id_index < kNumCodecIds; ++id_index) {
+ if (codec_id == kWebmCodecIds[id_index]) {
+ id_is_webm = true;
+ break;
+ }
+ }
+
+ if (!id_is_webm)
+ return false;
+ }
+
+ return true;
+}
+
} // namespace mkvmuxer
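A minimal caller-side sketch of the Projection plumbing introduced above. This is illustrative only: the writer, track dimensions, and codec choice are assumptions, and it relies on the existing Segment/VideoTrack helpers from mkvmuxer.h (Segment::Init, Segment::AddVideoTrack, Segment::GetTrackByNumber) rather than anything added by this change.

#include "mkvmuxer/mkvmuxer.h"

// Hypothetical helper: create a VP9 video track carrying an equirectangular
// Projection element.
bool AddEquirectangularVp9Track(mkvmuxer::IMkvWriter* writer) {
  mkvmuxer::Segment segment;
  if (!segment.Init(writer))
    return false;

  const uint64_t track_number = segment.AddVideoTrack(3840, 1920, 0);
  mkvmuxer::VideoTrack* const video_track =
      static_cast<mkvmuxer::VideoTrack*>(
          segment.GetTrackByNumber(track_number));
  if (!video_track)
    return false;
  video_track->set_codec_id(mkvmuxer::Tracks::kVp9CodecId);

  mkvmuxer::Projection projection;
  projection.set_type(mkvmuxer::Projection::kEquirectangular);
  projection.set_pose_yaw(0.0f);
  projection.set_pose_pitch(0.0f);
  projection.set_pose_roll(0.0f);
  // VideoTrack::SetProjection() deep-copies |projection|, so the local may
  // go out of scope after this call.
  return video_track->SetProjection(projection);
}

Because V_VP9 appears in the codec ID list checked by Segment::DocTypeIsWebm(), WriteSegmentHeader() passes the webm DocType to the new WriteEbmlHeader() overload; a track using any codec ID outside that list switches the header to the matroska DocType instead.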
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.h b/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.h
index 55ba07196df..46b0029dc47 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.h
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.h
@@ -64,6 +64,12 @@ class IMkvWriter {
LIBWEBM_DISALLOW_COPY_AND_ASSIGN(IMkvWriter);
};
+// Writes out the EBML header for a WebM file, but allows caller to specify
+// DocType. This function must be called before any other libwebm writing
+// functions are called.
+bool WriteEbmlHeader(IMkvWriter* writer, uint64_t doc_type_version,
+ const char* const doc_type);
+
// Writes out the EBML header for a WebM file. This function must be called
// before any other libwebm writing functions are called.
bool WriteEbmlHeader(IMkvWriter* writer, uint64_t doc_type_version);
@@ -348,26 +354,42 @@ class ContentEncoding {
///////////////////////////////////////////////////////////////
// Colour element.
-struct PrimaryChromaticity {
- PrimaryChromaticity(float x_val, float y_val) : x(x_val), y(y_val) {}
- PrimaryChromaticity() : x(0), y(0) {}
+class PrimaryChromaticity {
+ public:
+ static const float kChromaticityMin;
+ static const float kChromaticityMax;
+
+ PrimaryChromaticity(float x_val, float y_val) : x_(x_val), y_(y_val) {}
+ PrimaryChromaticity() : x_(0), y_(0) {}
~PrimaryChromaticity() {}
- uint64_t PrimaryChromaticityPayloadSize(libwebm::MkvId x_id,
- libwebm::MkvId y_id) const;
+
+ // Returns sum of |x_id| and |y_id| element id sizes and payload sizes.
+ uint64_t PrimaryChromaticitySize(libwebm::MkvId x_id,
+ libwebm::MkvId y_id) const;
+ bool Valid() const;
bool Write(IMkvWriter* writer, libwebm::MkvId x_id,
libwebm::MkvId y_id) const;
- float x;
- float y;
+ float x() const { return x_; }
+ void set_x(float new_x) { x_ = new_x; }
+ float y() const { return y_; }
+ void set_y(float new_y) { y_ = new_y; }
+
+ private:
+ float x_;
+ float y_;
};
class MasteringMetadata {
public:
static const float kValueNotPresent;
+ static const float kMinLuminance;
+ static const float kMinLuminanceMax;
+ static const float kMaxLuminanceMax;
MasteringMetadata()
- : luminance_max(kValueNotPresent),
- luminance_min(kValueNotPresent),
+ : luminance_max_(kValueNotPresent),
+ luminance_min_(kValueNotPresent),
r_(NULL),
g_(NULL),
b_(NULL),
@@ -381,6 +403,7 @@ class MasteringMetadata {
// Returns total size of the MasteringMetadata element.
uint64_t MasteringMetadataSize() const;
+ bool Valid() const;
bool Write(IMkvWriter* writer) const;
// Copies non-null chromaticity.
@@ -393,13 +416,21 @@ class MasteringMetadata {
const PrimaryChromaticity* b() const { return b_; }
const PrimaryChromaticity* white_point() const { return white_point_; }
- float luminance_max;
- float luminance_min;
+ float luminance_max() const { return luminance_max_; }
+ void set_luminance_max(float luminance_max) {
+ luminance_max_ = luminance_max;
+ }
+ float luminance_min() const { return luminance_min_; }
+ void set_luminance_min(float luminance_min) {
+ luminance_min_ = luminance_min;
+ }
private:
// Returns size of MasteringMetadata child elements.
uint64_t PayloadSize() const;
+ float luminance_max_;
+ float luminance_min_;
PrimaryChromaticity* r_;
PrimaryChromaticity* g_;
PrimaryChromaticity* b_;
@@ -408,26 +439,90 @@ class MasteringMetadata {
class Colour {
public:
+ enum MatrixCoefficients {
+ kGbr = 0,
+ kBt709 = 1,
+ kUnspecifiedMc = 2,
+ kReserved = 3,
+ kFcc = 4,
+ kBt470bg = 5,
+ kSmpte170MMc = 6,
+ kSmpte240MMc = 7,
+ kYcocg = 8,
+ kBt2020NonConstantLuminance = 9,
+ kBt2020ConstantLuminance = 10,
+ };
+ enum ChromaSitingHorz {
+ kUnspecifiedCsh = 0,
+ kLeftCollocated = 1,
+ kHalfCsh = 2,
+ };
+ enum ChromaSitingVert {
+ kUnspecifiedCsv = 0,
+ kTopCollocated = 1,
+ kHalfCsv = 2,
+ };
+ enum Range {
+ kUnspecifiedCr = 0,
+ kBroadcastRange = 1,
+ kFullRange = 2,
+ kMcTcDefined = 3, // Defined by MatrixCoefficients/TransferCharacteristics.
+ };
+ enum TransferCharacteristics {
+ kIturBt709Tc = 1,
+ kUnspecifiedTc = 2,
+ kReservedTc = 3,
+ kGamma22Curve = 4,
+ kGamma28Curve = 5,
+ kSmpte170MTc = 6,
+ kSmpte240MTc = 7,
+ kLinear = 8,
+ kLog = 9,
+ kLogSqrt = 10,
+ kIec6196624 = 11,
+ kIturBt1361ExtendedColourGamut = 12,
+ kIec6196621 = 13,
+ kIturBt202010bit = 14,
+ kIturBt202012bit = 15,
+ kSmpteSt2084 = 16,
+ kSmpteSt4281Tc = 17,
+ kAribStdB67Hlg = 18,
+ };
+ enum Primaries {
+ kReservedP0 = 0,
+ kIturBt709P = 1,
+ kUnspecifiedP = 2,
+ kReservedP3 = 3,
+ kIturBt470M = 4,
+ kIturBt470Bg = 5,
+ kSmpte170MP = 6,
+ kSmpte240MP = 7,
+ kFilm = 8,
+ kIturBt2020 = 9,
+ kSmpteSt4281P = 10,
+ kJedecP22Phosphors = 22,
+ };
static const uint64_t kValueNotPresent;
Colour()
- : matrix_coefficients(kValueNotPresent),
- bits_per_channel(kValueNotPresent),
- chroma_subsampling_horz(kValueNotPresent),
- chroma_subsampling_vert(kValueNotPresent),
- cb_subsampling_horz(kValueNotPresent),
- cb_subsampling_vert(kValueNotPresent),
- chroma_siting_horz(kValueNotPresent),
- chroma_siting_vert(kValueNotPresent),
- range(kValueNotPresent),
- transfer_characteristics(kValueNotPresent),
- primaries(kValueNotPresent),
- max_cll(kValueNotPresent),
- max_fall(kValueNotPresent),
+ : matrix_coefficients_(kValueNotPresent),
+ bits_per_channel_(kValueNotPresent),
+ chroma_subsampling_horz_(kValueNotPresent),
+ chroma_subsampling_vert_(kValueNotPresent),
+ cb_subsampling_horz_(kValueNotPresent),
+ cb_subsampling_vert_(kValueNotPresent),
+ chroma_siting_horz_(kValueNotPresent),
+ chroma_siting_vert_(kValueNotPresent),
+ range_(kValueNotPresent),
+ transfer_characteristics_(kValueNotPresent),
+ primaries_(kValueNotPresent),
+ max_cll_(kValueNotPresent),
+ max_fall_(kValueNotPresent),
mastering_metadata_(NULL) {}
~Colour() { delete mastering_metadata_; }
// Returns total size of the Colour element.
uint64_t ColourSize() const;
+ bool Valid() const;
bool Write(IMkvWriter* writer) const;
// Deep copies |mastering_metadata|.
@@ -437,28 +532,125 @@ class Colour {
return mastering_metadata_;
}
- uint64_t matrix_coefficients;
- uint64_t bits_per_channel;
- uint64_t chroma_subsampling_horz;
- uint64_t chroma_subsampling_vert;
- uint64_t cb_subsampling_horz;
- uint64_t cb_subsampling_vert;
- uint64_t chroma_siting_horz;
- uint64_t chroma_siting_vert;
- uint64_t range;
- uint64_t transfer_characteristics;
- uint64_t primaries;
- uint64_t max_cll;
- uint64_t max_fall;
+ uint64_t matrix_coefficients() const { return matrix_coefficients_; }
+ void set_matrix_coefficients(uint64_t matrix_coefficients) {
+ matrix_coefficients_ = matrix_coefficients;
+ }
+ uint64_t bits_per_channel() const { return bits_per_channel_; }
+ void set_bits_per_channel(uint64_t bits_per_channel) {
+ bits_per_channel_ = bits_per_channel;
+ }
+ uint64_t chroma_subsampling_horz() const { return chroma_subsampling_horz_; }
+ void set_chroma_subsampling_horz(uint64_t chroma_subsampling_horz) {
+ chroma_subsampling_horz_ = chroma_subsampling_horz;
+ }
+ uint64_t chroma_subsampling_vert() const { return chroma_subsampling_vert_; }
+ void set_chroma_subsampling_vert(uint64_t chroma_subsampling_vert) {
+ chroma_subsampling_vert_ = chroma_subsampling_vert;
+ }
+ uint64_t cb_subsampling_horz() const { return cb_subsampling_horz_; }
+ void set_cb_subsampling_horz(uint64_t cb_subsampling_horz) {
+ cb_subsampling_horz_ = cb_subsampling_horz;
+ }
+ uint64_t cb_subsampling_vert() const { return cb_subsampling_vert_; }
+ void set_cb_subsampling_vert(uint64_t cb_subsampling_vert) {
+ cb_subsampling_vert_ = cb_subsampling_vert;
+ }
+ uint64_t chroma_siting_horz() const { return chroma_siting_horz_; }
+ void set_chroma_siting_horz(uint64_t chroma_siting_horz) {
+ chroma_siting_horz_ = chroma_siting_horz;
+ }
+ uint64_t chroma_siting_vert() const { return chroma_siting_vert_; }
+ void set_chroma_siting_vert(uint64_t chroma_siting_vert) {
+ chroma_siting_vert_ = chroma_siting_vert;
+ }
+ uint64_t range() const { return range_; }
+ void set_range(uint64_t range) { range_ = range; }
+ uint64_t transfer_characteristics() const {
+ return transfer_characteristics_;
+ }
+ void set_transfer_characteristics(uint64_t transfer_characteristics) {
+ transfer_characteristics_ = transfer_characteristics;
+ }
+ uint64_t primaries() const { return primaries_; }
+ void set_primaries(uint64_t primaries) { primaries_ = primaries; }
+ uint64_t max_cll() const { return max_cll_; }
+ void set_max_cll(uint64_t max_cll) { max_cll_ = max_cll; }
+ uint64_t max_fall() const { return max_fall_; }
+ void set_max_fall(uint64_t max_fall) { max_fall_ = max_fall; }
private:
// Returns size of Colour child elements.
uint64_t PayloadSize() const;
+ uint64_t matrix_coefficients_;
+ uint64_t bits_per_channel_;
+ uint64_t chroma_subsampling_horz_;
+ uint64_t chroma_subsampling_vert_;
+ uint64_t cb_subsampling_horz_;
+ uint64_t cb_subsampling_vert_;
+ uint64_t chroma_siting_horz_;
+ uint64_t chroma_siting_vert_;
+ uint64_t range_;
+ uint64_t transfer_characteristics_;
+ uint64_t primaries_;
+ uint64_t max_cll_;
+ uint64_t max_fall_;
+
MasteringMetadata* mastering_metadata_;
};
///////////////////////////////////////////////////////////////
+// Projection element.
+class Projection {
+ public:
+ enum ProjectionType {
+ kTypeNotPresent = -1,
+ kRectangular = 0,
+ kEquirectangular = 1,
+ kCubeMap = 2,
+ kMesh = 3,
+ };
+ static const uint64_t kValueNotPresent;
+ Projection()
+ : type_(kRectangular),
+ pose_yaw_(0.0),
+ pose_pitch_(0.0),
+ pose_roll_(0.0),
+ private_data_(NULL),
+ private_data_length_(0) {}
+ ~Projection() { delete[] private_data_; }
+
+ uint64_t ProjectionSize() const;
+ bool Write(IMkvWriter* writer) const;
+
+ bool SetProjectionPrivate(const uint8_t* private_data,
+ uint64_t private_data_length);
+
+ ProjectionType type() const { return type_; }
+ void set_type(ProjectionType type) { type_ = type; }
+ float pose_yaw() const { return pose_yaw_; }
+ void set_pose_yaw(float pose_yaw) { pose_yaw_ = pose_yaw; }
+ float pose_pitch() const { return pose_pitch_; }
+ void set_pose_pitch(float pose_pitch) { pose_pitch_ = pose_pitch; }
+ float pose_roll() const { return pose_roll_; }
+ void set_pose_roll(float pose_roll) { pose_roll_ = pose_roll; }
+ uint8_t* private_data() const { return private_data_; }
+ uint64_t private_data_length() const { return private_data_length_; }
+
+ private:
+ // Returns size of VideoProjection child elements.
+ uint64_t PayloadSize() const;
+
+ ProjectionType type_;
+ float pose_yaw_;
+ float pose_pitch_;
+ float pose_roll_;
+ uint8_t* private_data_;
+ uint64_t private_data_length_;
+};
+
+///////////////////////////////////////////////////////////////
// Track element.
class Track {
public:
@@ -581,6 +773,10 @@ class VideoTrack : public Track {
uint64_t display_height() const { return display_height_; }
void set_display_width(uint64_t width) { display_width_ = width; }
uint64_t display_width() const { return display_width_; }
+ void set_pixel_height(uint64_t height) { pixel_height_ = height; }
+ uint64_t pixel_height() const { return pixel_height_; }
+ void set_pixel_width(uint64_t width) { pixel_width_ = width; }
+ uint64_t pixel_width() const { return pixel_width_; }
void set_crop_left(uint64_t crop_left) { crop_left_ = crop_left; }
uint64_t crop_left() const { return crop_left_; }
@@ -605,6 +801,11 @@ class VideoTrack : public Track {
// Deep copies |colour|.
bool SetColour(const Colour& colour);
+ Projection* projection() { return projection_; }
+
+ // Deep copies |projection|.
+ bool SetProjection(const Projection& projection);
+
private:
// Returns the size in bytes of the Video element.
uint64_t VideoPayloadSize() const;
@@ -612,6 +813,8 @@ class VideoTrack : public Track {
// Video track element names.
uint64_t display_height_;
uint64_t display_width_;
+ uint64_t pixel_height_;
+ uint64_t pixel_width_;
uint64_t crop_left_;
uint64_t crop_right_;
uint64_t crop_top_;
@@ -623,6 +826,7 @@ class VideoTrack : public Track {
uint64_t width_;
Colour* colour_;
+ Projection* projection_;
LIBWEBM_DISALLOW_COPY_AND_ASSIGN(VideoTrack);
};
@@ -670,6 +874,10 @@ class Tracks {
static const char kVp8CodecId[];
static const char kVp9CodecId[];
static const char kVp10CodecId[];
+ static const char kWebVttCaptionsId[];
+ static const char kWebVttDescriptionsId[];
+ static const char kWebVttMetadataId[];
+ static const char kWebVttSubtitlesId[];
Tracks();
~Tracks();
@@ -1294,8 +1502,8 @@ class Segment {
kBeforeClusters = 0x1 // Position Cues before Clusters
};
- const static uint32_t kDefaultDocTypeVersion = 2;
- const static uint64_t kDefaultMaxClusterDuration = 30000000000ULL;
+ static const uint32_t kDefaultDocTypeVersion = 4;
+ static const uint64_t kDefaultMaxClusterDuration = 30000000000ULL;
Segment();
~Segment();
@@ -1481,7 +1689,16 @@ class Segment {
Mode mode() const { return mode_; }
CuesPosition cues_position() const { return cues_position_; }
bool output_cues() const { return output_cues_; }
+ void set_estimate_file_duration(bool estimate_duration) {
+ estimate_file_duration_ = estimate_duration;
+ }
+ bool estimate_file_duration() const { return estimate_file_duration_; }
const SegmentInfo* segment_info() const { return &segment_info_; }
+ void set_duration(double duration) { duration_ = duration; }
+ double duration() const { return duration_; }
+
+ // Returns true when codec IDs are valid for WebM.
+ bool DocTypeIsWebm() const;
private:
// Checks if header information has been output and initialized. If not it
@@ -1637,6 +1854,9 @@ class Segment {
// Last timestamp in nanoseconds by track number added to a cluster.
uint64_t last_track_timestamp_[kMaxTrackNumber];
+ // Number of frames written per track.
+ uint64_t track_frames_written_[kMaxTrackNumber];
+
// Maximum time in nanoseconds for a cluster duration. This variable is a
// guideline and some clusters may have a longer duration. Default is 30
// seconds.
@@ -1665,6 +1885,9 @@ class Segment {
// Flag whether or not to write the Cluster Timecode using exactly 8 bytes.
bool fixed_size_cluster_timecode_;
+ // Flag whether or not to estimate the file duration.
+ bool estimate_file_duration_;
+
// The size of the EBML header, used to validate the header if
// WriteEbmlHeader() is called more than once.
int32_t ebml_header_size_;
@@ -1682,6 +1905,9 @@ class Segment {
uint32_t doc_type_version_;
uint32_t doc_type_version_written_;
+ // If |duration_| is > 0, then explicitly set the duration of the segment.
+ double duration_;
+
// Pointer to the writer objects. Not owned by this class.
IMkvWriter* writer_cluster_;
IMkvWriter* writer_cues_;
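One behavioral note on the duration_ and estimate_file_duration_ members declared above: Segment::Finalize() now prefers an explicitly set duration and only falls back to the per-track frame estimate when the last block carries no duration. A caller-side sketch of the explicit path follows; the |segment| object and the 12.5-second value are assumptions, not part of this change.

// Pin the duration explicitly; Segment::set_duration() expects the value in
// timecode-scale ticks, the same unit Finalize() computes internally
// (nanoseconds divided by the SegmentInfo timecode scale).
const double duration_ns = 12.5 * 1000000000.0;  // illustrative value
segment.set_duration(duration_ns /
                     segment.segment_info()->timecode_scale());
// With an explicit duration the estimate is never used; it can also be
// disabled outright before Finalize().
segment.set_estimate_file_duration(false);
segment.Finalize();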
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc
index 3562b8ab828..1ba17ac1ba0 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc
@@ -31,20 +31,20 @@ namespace {
// Date elements are always 8 octets in size.
const int kDateElementSize = 8;
-uint64_t WriteBlock(IMkvWriter* writer, const Frame* const frame,
- int64_t timecode, uint64_t timecode_scale) {
- uint64_t block_additional_elem_size = 0;
- uint64_t block_addid_elem_size = 0;
- uint64_t block_more_payload_size = 0;
- uint64_t block_more_elem_size = 0;
- uint64_t block_additions_payload_size = 0;
- uint64_t block_additions_elem_size = 0;
+uint64 WriteBlock(IMkvWriter* writer, const Frame* const frame, int64 timecode,
+ uint64 timecode_scale) {
+ uint64 block_additional_elem_size = 0;
+ uint64 block_addid_elem_size = 0;
+ uint64 block_more_payload_size = 0;
+ uint64 block_more_elem_size = 0;
+ uint64 block_additions_payload_size = 0;
+ uint64 block_additions_elem_size = 0;
if (frame->additional()) {
block_additional_elem_size =
EbmlElementSize(libwebm::kMkvBlockAdditional, frame->additional(),
frame->additional_length());
- block_addid_elem_size =
- EbmlElementSize(libwebm::kMkvBlockAddID, frame->add_id());
+ block_addid_elem_size = EbmlElementSize(
+ libwebm::kMkvBlockAddID, static_cast<uint64>(frame->add_id()));
block_more_payload_size =
block_addid_elem_size + block_additional_elem_size;
@@ -58,32 +58,33 @@ uint64_t WriteBlock(IMkvWriter* writer, const Frame* const frame,
block_additions_payload_size;
}
- uint64_t discard_padding_elem_size = 0;
+ uint64 discard_padding_elem_size = 0;
if (frame->discard_padding() != 0) {
discard_padding_elem_size =
- EbmlElementSize(libwebm::kMkvDiscardPadding, frame->discard_padding());
+ EbmlElementSize(libwebm::kMkvDiscardPadding,
+ static_cast<int64>(frame->discard_padding()));
}
- const uint64_t reference_block_timestamp =
+ const uint64 reference_block_timestamp =
frame->reference_block_timestamp() / timecode_scale;
- uint64_t reference_block_elem_size = 0;
+ uint64 reference_block_elem_size = 0;
if (!frame->is_key()) {
reference_block_elem_size =
EbmlElementSize(libwebm::kMkvReferenceBlock, reference_block_timestamp);
}
- const uint64_t duration = frame->duration() / timecode_scale;
- uint64_t block_duration_elem_size = 0;
+ const uint64 duration = frame->duration() / timecode_scale;
+ uint64 block_duration_elem_size = 0;
if (duration > 0)
block_duration_elem_size =
EbmlElementSize(libwebm::kMkvBlockDuration, duration);
- const uint64_t block_payload_size = 4 + frame->length();
- const uint64_t block_elem_size =
+ const uint64 block_payload_size = 4 + frame->length();
+ const uint64 block_elem_size =
EbmlMasterElementSize(libwebm::kMkvBlock, block_payload_size) +
block_payload_size;
- const uint64_t block_group_payload_size =
+ const uint64 block_group_payload_size =
block_elem_size + block_additions_elem_size + block_duration_elem_size +
discard_padding_elem_size + reference_block_elem_size;
@@ -105,7 +106,7 @@ uint64_t WriteBlock(IMkvWriter* writer, const Frame* const frame,
if (SerializeInt(writer, 0, 1))
return 0;
- if (writer->Write(frame->frame(), static_cast<uint32_t>(frame->length())))
+ if (writer->Write(frame->frame(), static_cast<uint32>(frame->length())))
return 0;
if (frame->additional()) {
@@ -118,7 +119,8 @@ uint64_t WriteBlock(IMkvWriter* writer, const Frame* const frame,
block_more_payload_size))
return 0;
- if (!WriteEbmlElement(writer, libwebm::kMkvBlockAddID, frame->add_id()))
+ if (!WriteEbmlElement(writer, libwebm::kMkvBlockAddID,
+ static_cast<uint64>(frame->add_id())))
return 0;
if (!WriteEbmlElement(writer, libwebm::kMkvBlockAdditional,
@@ -129,7 +131,7 @@ uint64_t WriteBlock(IMkvWriter* writer, const Frame* const frame,
if (frame->discard_padding() != 0 &&
!WriteEbmlElement(writer, libwebm::kMkvDiscardPadding,
- frame->discard_padding())) {
+ static_cast<int64>(frame->discard_padding()))) {
return false;
}
@@ -148,38 +150,38 @@ uint64_t WriteBlock(IMkvWriter* writer, const Frame* const frame,
block_group_payload_size;
}
-uint64_t WriteSimpleBlock(IMkvWriter* writer, const Frame* const frame,
- int64_t timecode) {
+uint64 WriteSimpleBlock(IMkvWriter* writer, const Frame* const frame,
+ int64 timecode) {
if (WriteID(writer, libwebm::kMkvSimpleBlock))
return 0;
- const int32_t size = static_cast<int32_t>(frame->length()) + 4;
+ const int32 size = static_cast<int32>(frame->length()) + 4;
if (WriteUInt(writer, size))
return 0;
- if (WriteUInt(writer, static_cast<uint64_t>(frame->track_number())))
+ if (WriteUInt(writer, static_cast<uint64>(frame->track_number())))
return 0;
if (SerializeInt(writer, timecode, 2))
return 0;
- uint64_t flags = 0;
+ uint64 flags = 0;
if (frame->is_key())
flags |= 0x80;
if (SerializeInt(writer, flags, 1))
return 0;
- if (writer->Write(frame->frame(), static_cast<uint32_t>(frame->length())))
+ if (writer->Write(frame->frame(), static_cast<uint32>(frame->length())))
return 0;
- return static_cast<uint64_t>(GetUIntSize(libwebm::kMkvSimpleBlock) +
- GetCodedUIntSize(size) + 4 + frame->length());
+ return GetUIntSize(libwebm::kMkvSimpleBlock) + GetCodedUIntSize(size) + 4 +
+ frame->length();
}
} // namespace
-int32_t GetCodedUIntSize(uint64_t value) {
+int32 GetCodedUIntSize(uint64 value) {
if (value < 0x000000000000007FULL)
return 1;
else if (value < 0x0000000000003FFFULL)
@@ -197,7 +199,7 @@ int32_t GetCodedUIntSize(uint64_t value) {
return 8;
}
-int32_t GetUIntSize(uint64_t value) {
+int32 GetUIntSize(uint64 value) {
if (value < 0x0000000000000100ULL)
return 1;
else if (value < 0x0000000000010000ULL)
@@ -215,26 +217,26 @@ int32_t GetUIntSize(uint64_t value) {
return 8;
}
-int32_t GetIntSize(int64_t value) {
+int32 GetIntSize(int64 value) {
// Doubling the requested value ensures positive values with their high bit
// set are written with 0-padding to avoid flipping the signedness.
- const uint64_t v = (value < 0) ? value ^ -1LL : value;
+ const uint64 v = (value < 0) ? value ^ -1LL : value;
return GetUIntSize(2 * v);
}
-uint64_t EbmlMasterElementSize(uint64_t type, uint64_t value) {
+uint64 EbmlMasterElementSize(uint64 type, uint64 value) {
// Size of EBML ID
- int32_t ebml_size = GetUIntSize(type);
+ int32 ebml_size = GetUIntSize(type);
// Datasize
ebml_size += GetCodedUIntSize(value);
- return static_cast<uint64_t>(ebml_size);
+ return ebml_size;
}
-uint64_t EbmlElementSize(uint64_t type, int64_t value) {
+uint64 EbmlElementSize(uint64 type, int64 value) {
// Size of EBML ID
- int32_t ebml_size = GetUIntSize(type);
+ int32 ebml_size = GetUIntSize(type);
// Datasize
ebml_size += GetIntSize(value);
@@ -242,20 +244,19 @@ uint64_t EbmlElementSize(uint64_t type, int64_t value) {
// Size of Datasize
ebml_size++;
- return static_cast<uint64_t>(ebml_size);
+ return ebml_size;
}
-uint64_t EbmlElementSize(uint64_t type, uint64_t value) {
+uint64 EbmlElementSize(uint64 type, uint64 value) {
return EbmlElementSize(type, value, 0);
}
-uint64_t EbmlElementSize(uint64_t type, uint64_t value, uint64_t fixed_size) {
+uint64 EbmlElementSize(uint64 type, uint64 value, uint64 fixed_size) {
// Size of EBML ID
- uint64_t ebml_size = static_cast<uint64_t>(GetUIntSize(type));
+ uint64 ebml_size = GetUIntSize(type);
// Datasize
- ebml_size +=
- (fixed_size > 0) ? fixed_size : static_cast<uint64_t>(GetUIntSize(value));
+ ebml_size += (fixed_size > 0) ? fixed_size : GetUIntSize(value);
// Size of Datasize
ebml_size++;
@@ -263,9 +264,9 @@ uint64_t EbmlElementSize(uint64_t type, uint64_t value, uint64_t fixed_size) {
return ebml_size;
}
-uint64_t EbmlElementSize(uint64_t type, float /* value */) {
+uint64 EbmlElementSize(uint64 type, float /* value */) {
// Size of EBML ID
- uint64_t ebml_size = static_cast<uint64_t>(GetUIntSize(type));
+ uint64 ebml_size = GetUIntSize(type);
// Datasize
ebml_size += sizeof(float);
@@ -276,12 +277,12 @@ uint64_t EbmlElementSize(uint64_t type, float /* value */) {
return ebml_size;
}
-uint64_t EbmlElementSize(uint64_t type, const char* value) {
+uint64 EbmlElementSize(uint64 type, const char* value) {
if (!value)
return 0;
// Size of EBML ID
- uint64_t ebml_size = static_cast<uint64_t>(GetUIntSize(type));
+ uint64 ebml_size = GetUIntSize(type);
// Datasize
ebml_size += strlen(value);
@@ -292,12 +293,12 @@ uint64_t EbmlElementSize(uint64_t type, const char* value) {
return ebml_size;
}
-uint64_t EbmlElementSize(uint64_t type, const uint8_t* value, uint64_t size) {
+uint64 EbmlElementSize(uint64 type, const uint8* value, uint64 size) {
if (!value)
return 0;
// Size of EBML ID
- uint64_t ebml_size = static_cast<uint64_t>(GetUIntSize(type));
+ uint64 ebml_size = GetUIntSize(type);
// Datasize
ebml_size += size;
@@ -308,9 +309,9 @@ uint64_t EbmlElementSize(uint64_t type, const uint8_t* value, uint64_t size) {
return ebml_size;
}
-uint64_t EbmlDateElementSize(uint64_t type) {
+uint64 EbmlDateElementSize(uint64 type) {
// Size of EBML ID
- uint64_t ebml_size = static_cast<uint64_t>(GetUIntSize(type));
+ uint64 ebml_size = GetUIntSize(type);
// Datasize
ebml_size += kDateElementSize;
@@ -321,18 +322,18 @@ uint64_t EbmlDateElementSize(uint64_t type) {
return ebml_size;
}
-int32_t SerializeInt(IMkvWriter* writer, int64_t value, int32_t size) {
+int32 SerializeInt(IMkvWriter* writer, int64 value, int32 size) {
if (!writer || size < 1 || size > 8)
return -1;
- for (int32_t i = 1; i <= size; ++i) {
- const int32_t byte_count = size - i;
- const int32_t bit_count = byte_count * 8;
+ for (int32 i = 1; i <= size; ++i) {
+ const int32 byte_count = size - i;
+ const int32 bit_count = byte_count * 8;
- const int64_t bb = value >> bit_count;
- const uint8_t b = static_cast<uint8_t>(bb);
+ const int64 bb = value >> bit_count;
+ const uint8 b = static_cast<uint8>(bb);
- const int32_t status = writer->Write(&b, 1);
+ const int32 status = writer->Write(&b, 1);
if (status < 0)
return status;
@@ -341,26 +342,26 @@ int32_t SerializeInt(IMkvWriter* writer, int64_t value, int32_t size) {
return 0;
}
-int32_t SerializeFloat(IMkvWriter* writer, float f) {
+int32 SerializeFloat(IMkvWriter* writer, float f) {
if (!writer)
return -1;
- assert(sizeof(uint32_t) == sizeof(float));
+ assert(sizeof(uint32) == sizeof(float));
// This union is merely used to avoid a reinterpret_cast from float& to
// uint32& which will result in violation of strict aliasing.
union U32 {
- uint32_t u32;
+ uint32 u32;
float f;
} value;
value.f = f;
- for (int32_t i = 1; i <= 4; ++i) {
- const int32_t byte_count = 4 - i;
- const int32_t bit_count = byte_count * 8;
+ for (int32 i = 1; i <= 4; ++i) {
+ const int32 byte_count = 4 - i;
+ const int32 bit_count = byte_count * 8;
- const uint8_t byte = static_cast<uint8_t>(value.u32 >> bit_count);
+ const uint8 byte = static_cast<uint8>(value.u32 >> bit_count);
- const int32_t status = writer->Write(&byte, 1);
+ const int32 status = writer->Write(&byte, 1);
if (status < 0)
return status;
@@ -369,21 +370,21 @@ int32_t SerializeFloat(IMkvWriter* writer, float f) {
return 0;
}
-int32_t WriteUInt(IMkvWriter* writer, uint64_t value) {
+int32 WriteUInt(IMkvWriter* writer, uint64 value) {
if (!writer)
return -1;
- int32_t size = GetCodedUIntSize(value);
+ int32 size = GetCodedUIntSize(value);
return WriteUIntSize(writer, value, size);
}
-int32_t WriteUIntSize(IMkvWriter* writer, uint64_t value, int32_t size) {
+int32 WriteUIntSize(IMkvWriter* writer, uint64 value, int32 size) {
if (!writer || size < 0 || size > 8)
return -1;
if (size > 0) {
- const uint64_t bit = 1LL << (size * 7);
+ const uint64 bit = 1LL << (size * 7);
if (value > (bit - 2))
return -1;
@@ -391,11 +392,11 @@ int32_t WriteUIntSize(IMkvWriter* writer, uint64_t value, int32_t size) {
value |= bit;
} else {
size = 1;
- int64_t bit;
+ int64 bit;
for (;;) {
bit = 1LL << (size * 7);
- const uint64_t max = bit - 2;
+ const uint64 max = bit - 2;
if (value <= max)
break;
@@ -412,18 +413,18 @@ int32_t WriteUIntSize(IMkvWriter* writer, uint64_t value, int32_t size) {
return SerializeInt(writer, value, size);
}
-int32_t WriteID(IMkvWriter* writer, uint64_t type) {
+int32 WriteID(IMkvWriter* writer, uint64 type) {
if (!writer)
return -1;
writer->ElementStartNotify(type, writer->Position());
- const int32_t size = GetUIntSize(type);
+ const int32 size = GetUIntSize(type);
return SerializeInt(writer, type, size);
}
-bool WriteEbmlMasterElement(IMkvWriter* writer, uint64_t type, uint64_t size) {
+bool WriteEbmlMasterElement(IMkvWriter* writer, uint64 type, uint64 size) {
if (!writer)
return false;
@@ -436,19 +437,19 @@ bool WriteEbmlMasterElement(IMkvWriter* writer, uint64_t type, uint64_t size) {
return true;
}
-bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, uint64_t value) {
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, uint64 value) {
return WriteEbmlElement(writer, type, value, 0);
}
-bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, uint64_t value,
- uint64_t fixed_size) {
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, uint64 value,
+ uint64 fixed_size) {
if (!writer)
return false;
if (WriteID(writer, type))
return false;
- uint64_t size = static_cast<uint64_t>(GetUIntSize(value));
+ uint64 size = GetUIntSize(value);
if (fixed_size > 0) {
if (size > fixed_size)
return false;
@@ -457,30 +458,30 @@ bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, uint64_t value,
if (WriteUInt(writer, size))
return false;
- if (SerializeInt(writer, value, static_cast<int32_t>(size)))
+ if (SerializeInt(writer, value, static_cast<int32>(size)))
return false;
return true;
}
-bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, int64_t value) {
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, int64 value) {
if (!writer)
return false;
if (WriteID(writer, type))
return 0;
- const uint64_t size = GetIntSize(value);
+ const uint64 size = GetIntSize(value);
if (WriteUInt(writer, size))
return false;
- if (SerializeInt(writer, value, static_cast<int32_t>(size)))
+ if (SerializeInt(writer, value, static_cast<int32>(size)))
return false;
return true;
}
-bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, float value) {
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, float value) {
if (!writer)
return false;
@@ -496,25 +497,25 @@ bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, float value) {
return true;
}
-bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, const char* value) {
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, const char* value) {
if (!writer || !value)
return false;
if (WriteID(writer, type))
return false;
- const uint64_t length = strlen(value);
+ const uint64 length = strlen(value);
if (WriteUInt(writer, length))
return false;
- if (writer->Write(value, static_cast<const uint32_t>(length)))
+ if (writer->Write(value, static_cast<const uint32>(length)))
return false;
return true;
}
-bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, const uint8_t* value,
- uint64_t size) {
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, const uint8* value,
+ uint64 size) {
if (!writer || !value || size < 1)
return false;
@@ -524,13 +525,13 @@ bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, const uint8_t* value,
if (WriteUInt(writer, size))
return false;
- if (writer->Write(value, static_cast<uint32_t>(size)))
+ if (writer->Write(value, static_cast<uint32>(size)))
return false;
return true;
}
-bool WriteEbmlDateElement(IMkvWriter* writer, uint64_t type, int64_t value) {
+bool WriteEbmlDateElement(IMkvWriter* writer, uint64 type, int64 value) {
if (!writer)
return false;
@@ -546,8 +547,8 @@ bool WriteEbmlDateElement(IMkvWriter* writer, uint64_t type, int64_t value) {
return true;
}
-uint64_t WriteFrame(IMkvWriter* writer, const Frame* const frame,
- Cluster* cluster) {
+uint64 WriteFrame(IMkvWriter* writer, const Frame* const frame,
+ Cluster* cluster) {
if (!writer || !frame || !frame->IsValid() || !cluster ||
!cluster->timecode_scale())
return 0;
@@ -556,7 +557,7 @@ uint64_t WriteFrame(IMkvWriter* writer, const Frame* const frame,
// timecode for the cluster itself (remember that block timecode
// is a signed, 16-bit integer). However, as a simplification we
// only permit non-negative cluster-relative timecodes for blocks.
- const int64_t relative_timecode = cluster->GetRelativeTimecode(
+ const int64 relative_timecode = cluster->GetRelativeTimecode(
frame->timestamp() / cluster->timecode_scale());
if (relative_timecode < 0 || relative_timecode > kMaxBlockTimecode)
return 0;
@@ -567,20 +568,19 @@ uint64_t WriteFrame(IMkvWriter* writer, const Frame* const frame,
cluster->timecode_scale());
}
-uint64_t WriteVoidElement(IMkvWriter* writer, uint64_t size) {
+uint64 WriteVoidElement(IMkvWriter* writer, uint64 size) {
if (!writer)
return false;
// Subtract one for the void ID and the coded size.
- uint64_t void_entry_size = size - 1 - GetCodedUIntSize(size - 1);
- uint64_t void_size =
- EbmlMasterElementSize(libwebm::kMkvVoid, void_entry_size) +
- void_entry_size;
+ uint64 void_entry_size = size - 1 - GetCodedUIntSize(size - 1);
+ uint64 void_size = EbmlMasterElementSize(libwebm::kMkvVoid, void_entry_size) +
+ void_entry_size;
if (void_size != size)
return 0;
- const int64_t payload_position = writer->Position();
+ const int64 payload_position = writer->Position();
if (payload_position < 0)
return 0;
@@ -590,30 +590,29 @@ uint64_t WriteVoidElement(IMkvWriter* writer, uint64_t size) {
if (WriteUInt(writer, void_entry_size))
return 0;
- const uint8_t value = 0;
- for (int32_t i = 0; i < static_cast<int32_t>(void_entry_size); ++i) {
+ const uint8 value = 0;
+ for (int32 i = 0; i < static_cast<int32>(void_entry_size); ++i) {
if (writer->Write(&value, 1))
return 0;
}
- const int64_t stop_position = writer->Position();
+ const int64 stop_position = writer->Position();
if (stop_position < 0 ||
- stop_position - payload_position != static_cast<int64_t>(void_size))
+ stop_position - payload_position != static_cast<int64>(void_size))
return 0;
return void_size;
}
-void GetVersion(int32_t* major, int32_t* minor, int32_t* build,
- int32_t* revision) {
+void GetVersion(int32* major, int32* minor, int32* build, int32* revision) {
*major = 0;
*minor = 2;
*build = 1;
*revision = 0;
}
-uint64_t MakeUID(unsigned int* seed) {
- uint64_t uid = 0;
+uint64 MakeUID(unsigned int* seed) {
+ uint64 uid = 0;
#ifdef __MINGW32__
srand(*seed);
@@ -625,21 +624,22 @@ uint64_t MakeUID(unsigned int* seed) {
// TODO(fgalligan): Move random number generation to platform specific code.
#ifdef _MSC_VER
(void)seed;
- const int32_t nn = rand();
+ const int32 nn = rand();
#elif __ANDROID__
- int32_t temp_num = 1;
+ (void)seed;
+ int32 temp_num = 1;
int fd = open("/dev/urandom", O_RDONLY);
if (fd != -1) {
read(fd, &temp_num, sizeof(temp_num));
close(fd);
}
- const int32_t nn = temp_num;
+ const int32 nn = temp_num;
#elif defined __MINGW32__
- const int32_t nn = rand();
+ const int32 nn = rand();
#else
- const int32_t nn = rand_r(seed);
+ const int32 nn = rand_r(seed);
#endif
- const int32_t n = 0xFF & (nn >> 4); // throw away low-order bits
+ const int32 n = 0xFF & (nn >> 4); // throw away low-order bits
uid |= n;
}
@@ -647,4 +647,97 @@ uint64_t MakeUID(unsigned int* seed) {
return uid;
}
+bool IsMatrixCoefficientsValueValid(uint64_t value) {
+ switch (value) {
+ case mkvmuxer::Colour::kGbr:
+ case mkvmuxer::Colour::kBt709:
+ case mkvmuxer::Colour::kUnspecifiedMc:
+ case mkvmuxer::Colour::kReserved:
+ case mkvmuxer::Colour::kFcc:
+ case mkvmuxer::Colour::kBt470bg:
+ case mkvmuxer::Colour::kSmpte170MMc:
+ case mkvmuxer::Colour::kSmpte240MMc:
+ case mkvmuxer::Colour::kYcocg:
+ case mkvmuxer::Colour::kBt2020NonConstantLuminance:
+ case mkvmuxer::Colour::kBt2020ConstantLuminance:
+ return true;
+ }
+ return false;
+}
+
+bool IsChromaSitingHorzValueValid(uint64_t value) {
+ switch (value) {
+ case mkvmuxer::Colour::kUnspecifiedCsh:
+ case mkvmuxer::Colour::kLeftCollocated:
+ case mkvmuxer::Colour::kHalfCsh:
+ return true;
+ }
+ return false;
+}
+
+bool IsChromaSitingVertValueValid(uint64_t value) {
+ switch (value) {
+ case mkvmuxer::Colour::kUnspecifiedCsv:
+ case mkvmuxer::Colour::kTopCollocated:
+ case mkvmuxer::Colour::kHalfCsv:
+ return true;
+ }
+ return false;
+}
+
+bool IsColourRangeValueValid(uint64_t value) {
+ switch (value) {
+ case mkvmuxer::Colour::kUnspecifiedCr:
+ case mkvmuxer::Colour::kBroadcastRange:
+ case mkvmuxer::Colour::kFullRange:
+ case mkvmuxer::Colour::kMcTcDefined:
+ return true;
+ }
+ return false;
+}
+
+bool IsTransferCharacteristicsValueValid(uint64_t value) {
+ switch (value) {
+ case mkvmuxer::Colour::kIturBt709Tc:
+ case mkvmuxer::Colour::kUnspecifiedTc:
+ case mkvmuxer::Colour::kReservedTc:
+ case mkvmuxer::Colour::kGamma22Curve:
+ case mkvmuxer::Colour::kGamma28Curve:
+ case mkvmuxer::Colour::kSmpte170MTc:
+ case mkvmuxer::Colour::kSmpte240MTc:
+ case mkvmuxer::Colour::kLinear:
+ case mkvmuxer::Colour::kLog:
+ case mkvmuxer::Colour::kLogSqrt:
+ case mkvmuxer::Colour::kIec6196624:
+ case mkvmuxer::Colour::kIturBt1361ExtendedColourGamut:
+ case mkvmuxer::Colour::kIec6196621:
+ case mkvmuxer::Colour::kIturBt202010bit:
+ case mkvmuxer::Colour::kIturBt202012bit:
+ case mkvmuxer::Colour::kSmpteSt2084:
+ case mkvmuxer::Colour::kSmpteSt4281Tc:
+ case mkvmuxer::Colour::kAribStdB67Hlg:
+ return true;
+ }
+ return false;
+}
+
+bool IsPrimariesValueValid(uint64_t value) {
+ switch (value) {
+ case mkvmuxer::Colour::kReservedP0:
+ case mkvmuxer::Colour::kIturBt709P:
+ case mkvmuxer::Colour::kUnspecifiedP:
+ case mkvmuxer::Colour::kReservedP3:
+ case mkvmuxer::Colour::kIturBt470M:
+ case mkvmuxer::Colour::kIturBt470Bg:
+ case mkvmuxer::Colour::kSmpte170MP:
+ case mkvmuxer::Colour::kSmpte240MP:
+ case mkvmuxer::Colour::kFilm:
+ case mkvmuxer::Colour::kIturBt2020:
+ case mkvmuxer::Colour::kSmpteSt4281P:
+ case mkvmuxer::Colour::kJedecP22Phosphors:
+ return true;
+ }
+ return false;
+}
+
} // namespace mkvmuxer
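
The WriteUInt()/WriteUIntSize() pair above emits EBML "coded" unsigned integers: an n-byte coded value carries a length-marker bit at bit position n*7 followed by the value itself in big-endian byte order, and the all-ones payload is reserved, which is why anything larger than (1 << n*7) - 2 has to move up to a wider encoding. A minimal standalone sketch of that encoding, independent of the patch (EncodeVint and its vector return type are illustrative only, not libwebm API):

#include <cstdint>
#include <vector>

// Encode |value| as an EBML coded (VINT) unsigned integer, big-endian,
// using the smallest width whose all-ones payload is not needed.
std::vector<uint8_t> EncodeVint(uint64_t value) {
  int size = 1;
  while (size <= 8 && value > (1ULL << (size * 7)) - 2) ++size;
  if (size > 8) return std::vector<uint8_t>();  // does not fit in 8 bytes
  const uint64_t marked = value | (1ULL << (size * 7));  // length-marker bit
  std::vector<uint8_t> out(size);
  for (int i = 0; i < size; ++i)  // most significant byte first
    out[i] = static_cast<uint8_t>(marked >> ((size - 1 - i) * 8));
  return out;
}

// EncodeVint(1) yields {0x81}; EncodeVint(500) yields {0x41, 0xF4}, which is
// what WriteUInt() above produces through WriteUIntSize() and SerializeInt().
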
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.h b/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.h
index 0e21a2dcbe5..132388da599 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.h
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.h
@@ -8,87 +8,104 @@
#ifndef MKVMUXER_MKVMUXERUTIL_H_
#define MKVMUXER_MKVMUXERUTIL_H_
-#include <stdint.h>
+#include "mkvmuxertypes.h"
+
+#include "stdint.h"
namespace mkvmuxer {
class Cluster;
class Frame;
class IMkvWriter;
-const uint64_t kEbmlUnknownValue = 0x01FFFFFFFFFFFFFFULL;
-const int64_t kMaxBlockTimecode = 0x07FFFLL;
+// TODO(tomfinegan): mkvmuxer:: integer types continue to be used here because
+// changing them causes pain for downstream projects. It would be nice to find
+// a solution that allows removal of the mkvmuxer:: integer types while
+// avoiding pain for downstream users of libwebm. Considering that
+// mkvmuxerutil.{cc,h} are really, for the great majority of cases, EBML size
+// calculation and writer functions, perhaps a more EBML-focused utility would
+// be the way to go as a first step.
+
+const uint64 kEbmlUnknownValue = 0x01FFFFFFFFFFFFFFULL;
+const int64 kMaxBlockTimecode = 0x07FFFLL;
// Writes out |value| in Big Endian order. Returns 0 on success.
-int32_t SerializeInt(IMkvWriter* writer, int64_t value, int32_t size);
+int32 SerializeInt(IMkvWriter* writer, int64 value, int32 size);
// Returns the size in bytes of the element.
-int32_t GetUIntSize(uint64_t value);
-int32_t GetIntSize(int64_t value);
-int32_t GetCodedUIntSize(uint64_t value);
-uint64_t EbmlMasterElementSize(uint64_t type, uint64_t value);
-uint64_t EbmlElementSize(uint64_t type, int64_t value);
-uint64_t EbmlElementSize(uint64_t type, uint64_t value);
-uint64_t EbmlElementSize(uint64_t type, float value);
-uint64_t EbmlElementSize(uint64_t type, const char* value);
-uint64_t EbmlElementSize(uint64_t type, const uint8_t* value, uint64_t size);
-uint64_t EbmlDateElementSize(uint64_t type);
+int32 GetUIntSize(uint64 value);
+int32 GetIntSize(int64 value);
+int32 GetCodedUIntSize(uint64 value);
+uint64 EbmlMasterElementSize(uint64 type, uint64 value);
+uint64 EbmlElementSize(uint64 type, int64 value);
+uint64 EbmlElementSize(uint64 type, uint64 value);
+uint64 EbmlElementSize(uint64 type, float value);
+uint64 EbmlElementSize(uint64 type, const char* value);
+uint64 EbmlElementSize(uint64 type, const uint8* value, uint64 size);
+uint64 EbmlDateElementSize(uint64 type);
// Returns the size in bytes of the element assuming that the element was
// written using |fixed_size| bytes. If |fixed_size| is set to zero, then it
// computes the necessary number of bytes based on |value|.
-uint64_t EbmlElementSize(uint64_t type, uint64_t value, uint64_t fixed_size);
+uint64 EbmlElementSize(uint64 type, uint64 value, uint64 fixed_size);
// Creates an EBML coded number from |value| and writes it out. The size of
// the coded number is determined by the value of |value|. |value| must not
// be in a coded form. Returns 0 on success.
-int32_t WriteUInt(IMkvWriter* writer, uint64_t value);
+int32 WriteUInt(IMkvWriter* writer, uint64 value);
// Creates an EBML coded number from |value| and writes it out. The size of
// the coded number is determined by the value of |size|. |value| must not
// be in a coded form. Returns 0 on success.
-int32_t WriteUIntSize(IMkvWriter* writer, uint64_t value, int32_t size);
+int32 WriteUIntSize(IMkvWriter* writer, uint64 value, int32 size);
// Output an Mkv master element. Returns true if the element was written.
-bool WriteEbmlMasterElement(IMkvWriter* writer, uint64_t value, uint64_t size);
+bool WriteEbmlMasterElement(IMkvWriter* writer, uint64 value, uint64 size);
// Outputs an Mkv ID, calls |IMkvWriter::ElementStartNotify|, and passes the
// ID to |SerializeInt|. Returns 0 on success.
-int32_t WriteID(IMkvWriter* writer, uint64_t type);
+int32 WriteID(IMkvWriter* writer, uint64 type);
// Output an Mkv non-master element. Returns true if the element was written.
-bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, uint64_t value);
-bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, int64_t value);
-bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, float value);
-bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, const char* value);
-bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, const uint8_t* value,
- uint64_t size);
-bool WriteEbmlDateElement(IMkvWriter* writer, uint64_t type, int64_t value);
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, uint64 value);
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, int64 value);
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, float value);
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, const char* value);
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, const uint8* value,
+ uint64 size);
+bool WriteEbmlDateElement(IMkvWriter* writer, uint64 type, int64 value);
// Output an Mkv non-master element using fixed size. The element will be
// written out using exactly |fixed_size| bytes. If |fixed_size| is set to zero
// then it computes the necessary number of bytes based on |value|. Returns true
// if the element was written.
-bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, uint64_t value,
- uint64_t fixed_size);
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, uint64 value,
+ uint64 fixed_size);
// Output a Mkv Frame. It decides the correct element to write (Block vs
// SimpleBlock) based on the parameters of the Frame.
-uint64_t WriteFrame(IMkvWriter* writer, const Frame* const frame,
- Cluster* cluster);
+uint64 WriteFrame(IMkvWriter* writer, const Frame* const frame,
+ Cluster* cluster);
// Output a void element. |size| must be the entire size in bytes that will be
// void. The function will calculate the size of the void header and subtract
// it from |size|.
-uint64_t WriteVoidElement(IMkvWriter* writer, uint64_t size);
+uint64 WriteVoidElement(IMkvWriter* writer, uint64 size);
// Returns the version number of the muxer in |major|, |minor|, |build|,
// and |revision|.
-void GetVersion(int32_t* major, int32_t* minor, int32_t* build,
- int32_t* revision);
+void GetVersion(int32* major, int32* minor, int32* build, int32* revision);
// Returns a random number to be used for UID, using |seed| to seed
// the random-number generator (see POSIX rand_r() for semantics).
-uint64_t MakeUID(unsigned int* seed);
+uint64 MakeUID(unsigned int* seed);
+
+// Colour field validation helpers. All return true when |value| is valid.
+bool IsMatrixCoefficientsValueValid(uint64_t value);
+bool IsChromaSitingHorzValueValid(uint64_t value);
+bool IsChromaSitingVertValueValid(uint64_t value);
+bool IsColourRangeValueValid(uint64_t value);
+bool IsTransferCharacteristicsValueValid(uint64_t value);
+bool IsPrimariesValueValid(uint64_t value);
} // namespace mkvmuxer
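
For the Void sizing described above, a quick worked example (assuming the usual 1-byte Void ID and a 1-byte coded size, which holds for small payloads): asking WriteVoidElement() for |size| = 100 gives a payload of 100 - 1 (ID) - 1 (coded size of 99) = 98 bytes, and the element written back out is 1 (ID) + 1 (coded size of 98) + 98 (payload) = 100 bytes, so the void_size == size consistency check in the implementation passes and the reserved region is exactly |size| bytes long.
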
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvmuxer/mkvwriter.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvmuxer/mkvwriter.cc
index ca48e149c6d..ec34e4df818 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvmuxer/mkvwriter.cc
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvmuxer/mkvwriter.cc
@@ -77,7 +77,7 @@ int32 MkvWriter::Position(int64 position) {
#ifdef _MSC_VER
return _fseeki64(file_, position, SEEK_SET);
#else
- return fseek(file_, position, SEEK_SET);
+ return fseeko(file_, static_cast<off_t>(position), SEEK_SET);
#endif
}
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvparser/mkvparser.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvparser/mkvparser.cc
index 21801154d9f..e62d6f6075c 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvparser/mkvparser.cc
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvparser/mkvparser.cc
@@ -25,6 +25,7 @@
namespace mkvparser {
const float MasteringMetadata::kValueNotPresent = FLT_MAX;
const long long Colour::kValueNotPresent = LLONG_MAX;
+const float Projection::kValueNotPresent = FLT_MAX;
#ifdef MSC_COMPAT
inline bool isnan(double val) { return !!_isnan(val); }
@@ -1475,6 +1476,8 @@ long Segment::Load() {
}
}
+SeekHead::Entry::Entry() : id(0), pos(0), element_start(0), element_size(0) {}
+
SeekHead::SeekHead(Segment* pSegment, long long start, long long size_,
long long element_start, long long element_size)
: m_pSegment(pSegment),
@@ -1766,18 +1769,7 @@ bool SeekHead::ParseEntry(IMkvReader* pReader, long long start, long long size_,
if ((pos + seekIdSize) > stop)
return false;
- // Note that the SeekId payload really is serialized
- // as a "Matroska integer", not as a plain binary value.
- // In fact, Matroska requires that ID values in the
- // stream exactly match the binary representation as listed
- // in the Matroska specification.
- //
- // This parser is more liberal, and permits IDs to have
- // any width. (This could make the representation in the stream
- // different from what's in the spec, but it doesn't matter here,
- // since we always normalize "Matroska integer" values.)
-
- pEntry->id = ReadUInt(pReader, pos, len); // payload
+ pEntry->id = ReadID(pReader, pos, len); // payload
if (pEntry->id <= 0)
return false;
@@ -4125,7 +4117,7 @@ ContentEncoding::~ContentEncoding() {
}
const ContentEncoding::ContentCompression*
- ContentEncoding::GetCompressionByIndex(unsigned long idx) const {
+ContentEncoding::GetCompressionByIndex(unsigned long idx) const {
const ptrdiff_t count = compression_entries_end_ - compression_entries_;
assert(count >= 0);
@@ -5188,11 +5180,92 @@ bool Colour::Parse(IMkvReader* reader, long long colour_start,
return true;
}
+bool Projection::Parse(IMkvReader* reader, long long start, long long size,
+ Projection** projection) {
+ if (!reader || *projection)
+ return false;
+
+ std::auto_ptr<Projection> projection_ptr(new Projection());
+ if (!projection_ptr.get())
+ return false;
+
+ const long long end = start + size;
+ long long read_pos = start;
+
+ while (read_pos < end) {
+ long long child_id = 0;
+ long long child_size = 0;
+
+ const long long status =
+ ParseElementHeader(reader, read_pos, end, child_id, child_size);
+ if (status < 0)
+ return false;
+
+ if (child_id == libwebm::kMkvProjectionType) {
+ long long projection_type = kTypeNotPresent;
+ projection_type = UnserializeUInt(reader, read_pos, child_size);
+ if (projection_type < 0)
+ return false;
+
+ projection_ptr->type = static_cast<ProjectionType>(projection_type);
+ } else if (child_id == libwebm::kMkvProjectionPrivate) {
+ unsigned char* data = SafeArrayAlloc<unsigned char>(1, child_size);
+
+ if (data == NULL)
+ return false;
+
+ const int status =
+ reader->Read(read_pos, static_cast<long>(child_size), data);
+
+ if (status) {
+ delete[] data;
+ return false;
+ }
+
+ projection_ptr->private_data = data;
+ projection_ptr->private_data_length = static_cast<size_t>(child_size);
+ } else {
+ double value = 0;
+ const long long value_parse_status =
+ UnserializeFloat(reader, read_pos, child_size, value);
+ if (value_parse_status < 0) {
+ return false;
+ }
+
+ switch (child_id) {
+ case libwebm::kMkvProjectionPoseYaw:
+ projection_ptr->pose_yaw = static_cast<float>(value);
+ break;
+ case libwebm::kMkvProjectionPosePitch:
+ projection_ptr->pose_pitch = static_cast<float>(value);
+ break;
+ case libwebm::kMkvProjectionPoseRoll:
+ projection_ptr->pose_roll = static_cast<float>(value);
+ break;
+ default:
+ return false;
+ }
+ }
+
+ read_pos += child_size;
+ if (read_pos > end)
+ return false;
+ }
+
+ *projection = projection_ptr.release();
+ return true;
+}
+
VideoTrack::VideoTrack(Segment* pSegment, long long element_start,
long long element_size)
- : Track(pSegment, element_start, element_size), m_colour(NULL) {}
+ : Track(pSegment, element_start, element_size),
+ m_colour(NULL),
+ m_projection(NULL) {}
-VideoTrack::~VideoTrack() { delete m_colour; }
+VideoTrack::~VideoTrack() {
+ delete m_colour;
+ delete m_projection;
+}
long VideoTrack::Parse(Segment* pSegment, const Info& info,
long long element_start, long long element_size,
@@ -5224,6 +5297,7 @@ long VideoTrack::Parse(Segment* pSegment, const Info& info,
const long long stop = pos + s.size;
Colour* colour = NULL;
+ Projection* projection = NULL;
while (pos < stop) {
long long id, size;
@@ -5274,6 +5348,9 @@ long VideoTrack::Parse(Segment* pSegment, const Info& info,
} else if (id == libwebm::kMkvColour) {
if (!Colour::Parse(pReader, pos, size, &colour))
return E_FILE_FORMAT_INVALID;
+ } else if (id == libwebm::kMkvProjection) {
+ if (!Projection::Parse(pReader, pos, size, &projection))
+ return E_FILE_FORMAT_INVALID;
}
pos += size; // consume payload
@@ -5305,6 +5382,7 @@ long VideoTrack::Parse(Segment* pSegment, const Info& info,
pTrack->m_stereo_mode = stereo_mode;
pTrack->m_rate = rate;
pTrack->m_colour = colour;
+ pTrack->m_projection = projection;
pResult = pTrack;
return 0; // success
@@ -5405,6 +5483,8 @@ long VideoTrack::Seek(long long time_ns, const BlockEntry*& pResult) const {
Colour* VideoTrack::GetColour() const { return m_colour; }
+Projection* VideoTrack::GetProjection() const { return m_projection; }
+
long long VideoTrack::GetWidth() const { return m_width; }
long long VideoTrack::GetHeight() const { return m_height; }
@@ -6698,8 +6778,10 @@ Cluster::Cluster(Segment* pSegment, long idx, long long element_start
{}
Cluster::~Cluster() {
- if (m_entries_count <= 0)
+ if (m_entries_count <= 0) {
+ delete[] m_entries;
return;
+ }
BlockEntry** i = m_entries;
BlockEntry** const j = m_entries + m_entries_count;
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvparser/mkvparser.h b/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvparser/mkvparser.h
index 42e6e88ab46..26c2b7e5ebf 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvparser/mkvparser.h
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvparser/mkvparser.h
@@ -473,6 +473,34 @@ struct Colour {
MasteringMetadata* mastering_metadata;
};
+struct Projection {
+ enum ProjectionType {
+ kTypeNotPresent = -1,
+ kRectangular = 0,
+ kEquirectangular = 1,
+ kCubeMap = 2,
+ kMesh = 3,
+ };
+ static const float kValueNotPresent;
+ Projection()
+ : type(kTypeNotPresent),
+ private_data(NULL),
+ private_data_length(0),
+ pose_yaw(kValueNotPresent),
+ pose_pitch(kValueNotPresent),
+ pose_roll(kValueNotPresent) {}
+ ~Projection() { delete[] private_data; }
+ static bool Parse(IMkvReader* reader, long long element_start,
+ long long element_size, Projection** projection);
+
+ ProjectionType type;
+ unsigned char* private_data;
+ size_t private_data_length;
+ float pose_yaw;
+ float pose_pitch;
+ float pose_roll;
+};
+
class VideoTrack : public Track {
VideoTrack(const VideoTrack&);
VideoTrack& operator=(const VideoTrack&);
@@ -497,6 +525,8 @@ class VideoTrack : public Track {
Colour* GetColour() const;
+ Projection* GetProjection() const;
+
private:
long long m_width;
long long m_height;
@@ -508,6 +538,7 @@ class VideoTrack : public Track {
double m_rate;
Colour* m_colour;
+ Projection* m_projection;
};
class AudioTrack : public Track {
@@ -813,6 +844,8 @@ class SeekHead {
long Parse();
struct Entry {
+ Entry();
+
// the SeekHead entry payload
long long id;
long long pos;
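
The new GetProjection() accessor mirrors GetColour() on VideoTrack. A hedged usage sketch, assuming |video_track| was already obtained through the normal mkvparser Segment/Tracks parsing path and that the header include path matches this layout; DumpProjection itself is illustrative, not part of the patch:

#include <cstdio>

#include "mkvparser/mkvparser.h"

// Print the Projection metadata added by this patch, if the track has any.
void DumpProjection(const mkvparser::VideoTrack* video_track) {
  const mkvparser::Projection* const projection = video_track->GetProjection();
  if (projection == NULL) {
    printf("no Projection element present\n");
    return;
  }
  printf("projection type: %d\n", static_cast<int>(projection->type));
  if (projection->private_data != NULL)
    printf("ProjectionPrivate: %u bytes\n",
           static_cast<unsigned>(projection->private_data_length));
  if (projection->pose_yaw != mkvparser::Projection::kValueNotPresent)
    printf("pose yaw: %f\n", projection->pose_yaw);
}
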
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvparser/mkvreader.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvparser/mkvreader.cc
index 9f90d8c4f86..b8fd00c2635 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvparser/mkvreader.cc
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvparser/mkvreader.cc
@@ -117,7 +117,7 @@ int MkvReader::Read(long long offset, long len, unsigned char* buffer) {
if (status)
return -1; // error
#else
- fseek(m_file, offset, SEEK_SET);
+ fseeko(m_file, static_cast<off_t>(offset), SEEK_SET);
#endif
const size_t size = fread(buffer, 1, len, m_file);
@@ -128,4 +128,4 @@ int MkvReader::Read(long long offset, long len, unsigned char* buffer) {
return 0; // success
}
-} // namespace mkvparser
\ No newline at end of file
+} // namespace mkvparser
diff --git a/chromium/third_party/libvpx/source/libvpx/tools.mk b/chromium/third_party/libvpx/source/libvpx/tools.mk
new file mode 100644
index 00000000000..3c660b1dfd5
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/tools.mk
@@ -0,0 +1,110 @@
+##
+## Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+##
+## Use of this source code is governed by a BSD-style license
+## that can be found in the LICENSE file in the root of the source
+## tree. An additional intellectual property rights grant can be found
+## in the file PATENTS. All contributing project authors may
+## be found in the AUTHORS file in the root of the source tree.
+##
+
+# List of tools to build.
+TOOLS-yes += tiny_ssim.c
+tiny_ssim.SRCS += vpx/vpx_integer.h
+tiny_ssim.GUID = 3afa9b05-940b-4d68-b5aa-55157d8ed7b4
+tiny_ssim.DESCRIPTION = Generate SSIM/PSNR from raw .yuv files
+
+#
+# End of specified files. The rest of the build rules should happen
+# automagically from here.
+#
+
+
+# Expand list of selected tools to build (as specified above)
+TOOLS = $(addprefix tools/,$(call enabled,TOOLS))
+ALL_SRCS = $(foreach ex,$(TOOLS),$($(notdir $(ex:.c=)).SRCS))
+
+
+# Expand each tool's sources into a variable containing all sources
+# for that tool (not just the main one specified in TOOLS)
+# and add this file to the list (for MSVS workspace generation)
+$(foreach ex,$(TOOLS),$(eval $(notdir $(ex:.c=)).SRCS += $(ex) tools.mk))
+
+
+# Create build/install dependencies for all tools. The common case
+# is handled here. The MSVS case is handled below.
+NOT_MSVS = $(if $(CONFIG_MSVS),,yes)
+DIST-BINS-$(NOT_MSVS) += $(addprefix bin/,$(TOOLS:.c=$(EXE_SFX)))
+DIST-SRCS-yes += $(ALL_SRCS)
+OBJS-$(NOT_MSVS) += $(call objs,$(ALL_SRCS))
+BINS-$(NOT_MSVS) += $(addprefix $(BUILD_PFX),$(TOOLS:.c=$(EXE_SFX)))
+
+
+# Instantiate linker template for all tools.
+$(foreach bin,$(BINS-yes),\
+ $(eval $(bin):)\
+ $(eval $(call linker_template,$(bin),\
+ $(call objs,$($(notdir $(bin:$(EXE_SFX)=)).SRCS)) \
+ -lm\
+ )))
+
+
+# The following pairs define a mapping of locations in the distribution
+# tree to locations in the source/build trees.
+INSTALL_MAPS += src/%.c %.c
+INSTALL_MAPS += src/% $(SRC_PATH_BARE)/%
+INSTALL_MAPS += bin/% %
+INSTALL_MAPS += % %
+
+
+# Build Visual Studio Projects. We use a template here to instantiate
+# explicit rules rather than using an implicit rule because we want to
+# leverage make's VPATH searching rather than specifying the paths on
+# each file in TOOLS. This has the unfortunate side effect that
+# touching the source files triggers a rebuild of the project files
+# even though there is no real dependency there (the dependency is on
+# the makefiles). We may want to revisit this.
+define vcproj_template
+$(1): $($(1:.$(VCPROJ_SFX)=).SRCS) vpx.$(VCPROJ_SFX)
+ $(if $(quiet),@echo " [vcproj] $$@")
+ $(qexec)$$(GEN_VCPROJ)\
+ --exe\
+ --target=$$(TOOLCHAIN)\
+ --name=$$(@:.$(VCPROJ_SFX)=)\
+ --ver=$$(CONFIG_VS_VERSION)\
+ --proj-guid=$$($$(@:.$(VCPROJ_SFX)=).GUID)\
+ --src-path-bare="$(SRC_PATH_BARE)" \
+ $$(if $$(CONFIG_STATIC_MSVCRT),--static-crt) \
+ --out=$$@ $$(INTERNAL_CFLAGS) $$(CFLAGS) \
+ $$(INTERNAL_LDFLAGS) $$(LDFLAGS) $$^
+endef
+TOOLS_BASENAME := $(notdir $(TOOLS))
+PROJECTS-$(CONFIG_MSVS) += $(TOOLS_BASENAME:.c=.$(VCPROJ_SFX))
+INSTALL-BINS-$(CONFIG_MSVS) += $(foreach p,$(VS_PLATFORMS),\
+ $(addprefix bin/$(p)/,$(TOOLS_BASENAME:.c=.exe)))
+$(foreach proj,$(call enabled,PROJECTS),\
+ $(eval $(call vcproj_template,$(proj))))
+
+#
+# Documentation Rules
+#
+%.dox: %.c
+ @echo " [DOXY] $@"
+ @mkdir -p $(dir $@)
+ @echo "/*!\page tools_$(@F:.dox=) $(@F:.dox=)" > $@
+ @echo " \includelineno $(<F)" >> $@
+ @echo "*/" >> $@
+
+tools.dox: tools.mk
+ @echo " [DOXY] $@"
+ @echo "/*!\page tools Tools" > $@
+ @echo " This SDK includes a number of tools/utilities."\
+ "The following tools are included: ">>$@
+ @$(foreach ex,$(sort $(notdir $(TOOLS:.c=))),\
+ echo " - \subpage tools_$(ex) $($(ex).DESCRIPTION)" >> $@;)
+ @echo "*/" >> $@
+
+CLEAN-OBJS += tools.doxy tools.dox $(TOOLS:.c=.dox)
+DOCS-yes += tools.doxy tools.dox
+tools.doxy: tools.dox $(TOOLS:.c=.dox)
+ @echo "INPUT += $^" > $@
diff --git a/chromium/third_party/libvpx/source/libvpx/tools/tiny_ssim.c b/chromium/third_party/libvpx/source/libvpx/tools/tiny_ssim.c
new file mode 100644
index 00000000000..28052e0a84d
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/tools/tiny_ssim.c
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <errno.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "vpx/vpx_integer.h"
+
+void vp8_ssim_parms_8x8_c(unsigned char *s, int sp, unsigned char *r, int rp,
+ uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s,
+ uint32_t *sum_sq_r, uint32_t *sum_sxr) {
+ int i, j;
+ for (i = 0; i < 8; i++, s += sp, r += rp) {
+ for (j = 0; j < 8; j++) {
+ *sum_s += s[j];
+ *sum_r += r[j];
+ *sum_sq_s += s[j] * s[j];
+ *sum_sq_r += r[j] * r[j];
+ *sum_sxr += s[j] * r[j];
+ }
+ }
+}
+
+static const int64_t cc1 = 26634;  // 64^2*(.01*255)^2
+static const int64_t cc2 = 239708;  // 64^2*(.03*255)^2
+
+static double similarity(uint32_t sum_s, uint32_t sum_r, uint32_t sum_sq_s,
+ uint32_t sum_sq_r, uint32_t sum_sxr, int count) {
+ int64_t ssim_n, ssim_d;
+ int64_t c1, c2;
+
+ // scale the constants by number of pixels
+ c1 = (cc1 * count * count) >> 12;
+ c2 = (cc2 * count * count) >> 12;
+
+ ssim_n = (2 * sum_s * sum_r + c1) *
+ ((int64_t)2 * count * sum_sxr - (int64_t)2 * sum_s * sum_r + c2);
+
+ ssim_d = (sum_s * sum_s + sum_r * sum_r + c1) *
+ ((int64_t)count * sum_sq_s - (int64_t)sum_s * sum_s +
+ (int64_t)count * sum_sq_r - (int64_t)sum_r * sum_r + c2);
+
+ return ssim_n * 1.0 / ssim_d;
+}
+
+static double ssim_8x8(unsigned char *s, int sp, unsigned char *r, int rp) {
+ uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0;
+ vp8_ssim_parms_8x8_c(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r,
+ &sum_sxr);
+ return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64);
+}
+
+// We are using an 8x8 moving window with the starting location of each 8x8
+// window on the 4x4 pixel grid. Such an arrangement allows the windows to
+// overlap block boundaries to penalize blocking artifacts.
+double vp8_ssim2(unsigned char *img1, unsigned char *img2, int stride_img1,
+ int stride_img2, int width, int height) {
+ int i, j;
+ int samples = 0;
+ double ssim_total = 0;
+
+  // sample points start at each 4x4 location
+ for (i = 0; i <= height - 8;
+ i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) {
+ for (j = 0; j <= width - 8; j += 4) {
+ double v = ssim_8x8(img1 + j, stride_img1, img2 + j, stride_img2);
+ ssim_total += v;
+ samples++;
+ }
+ }
+ ssim_total /= samples;
+ return ssim_total;
+}
+
+static uint64_t calc_plane_error(uint8_t *orig, int orig_stride, uint8_t *recon,
+ int recon_stride, unsigned int cols,
+ unsigned int rows) {
+ unsigned int row, col;
+ uint64_t total_sse = 0;
+ int diff;
+
+ for (row = 0; row < rows; row++) {
+ for (col = 0; col < cols; col++) {
+ diff = orig[col] - recon[col];
+ total_sse += diff * diff;
+ }
+
+ orig += orig_stride;
+ recon += recon_stride;
+ }
+
+ return total_sse;
+}
+
+#define MAX_PSNR 100
+
+double vp9_mse2psnr(double samples, double peak, double mse) {
+ double psnr;
+
+ if (mse > 0.0)
+ psnr = 10.0 * log10(peak * peak * samples / mse);
+ else
+ psnr = MAX_PSNR; // Limit to prevent / 0
+
+ if (psnr > MAX_PSNR) psnr = MAX_PSNR;
+
+ return psnr;
+}
+
+int main(int argc, char *argv[]) {
+ FILE *f[2];
+ uint8_t *buf[2];
+ int w, h, n_frames, tl_skip = 0, tl_skips_remaining = 0;
+ double ssim = 0, psnravg = 0, psnrglb = 0;
+ double ssimy, ssimu, ssimv;
+ uint64_t psnry, psnru, psnrv;
+
+ if (argc < 4) {
+ fprintf(stderr, "Usage: %s file1.yuv file2.yuv WxH [tl_skip={0,1,3}]\n",
+ argv[0]);
+ return 1;
+ }
+ f[0] = strcmp(argv[1], "-") ? fopen(argv[1], "rb") : stdin;
+ f[1] = strcmp(argv[2], "-") ? fopen(argv[2], "rb") : stdin;
+ sscanf(argv[3], "%dx%d", &w, &h);
+ // Number of frames to skip from file1.yuv for every frame used. Normal values
+ // 0, 1 and 3 correspond to TL2, TL1 and TL0 respectively for a 3TL encoding
+ // in mode 10. 7 would be reasonable for comparing TL0 of a 4-layer encoding.
+ if (argc > 4) {
+ sscanf(argv[4], "%d", &tl_skip);
+ }
+ if (!f[0] || !f[1]) {
+ fprintf(stderr, "Could not open input files: %s\n", strerror(errno));
+ return 1;
+ }
+ if (w <= 0 || h <= 0 || w & 1 || h & 1) {
+ fprintf(stderr, "Invalid size %dx%d\n", w, h);
+ return 1;
+ }
+ buf[0] = malloc(w * h * 3 / 2);
+ buf[1] = malloc(w * h * 3 / 2);
+ n_frames = 0;
+ while (1) {
+ size_t r1, r2;
+ r1 = fread(buf[0], w * h * 3 / 2, 1, f[0]);
+ if (r1) {
+ // Reading parts of file1.yuv that were not used in temporal layer.
+ if (tl_skips_remaining > 0) {
+ --tl_skips_remaining;
+ continue;
+ }
+ // Use frame, but skip |tl_skip| after it.
+ tl_skips_remaining = tl_skip;
+ }
+ r2 = fread(buf[1], w * h * 3 / 2, 1, f[1]);
+ if (r1 && r2 && r1 != r2) {
+ fprintf(stderr, "Failed to read data: %s [%d/%d]\n", strerror(errno),
+ (int)r1, (int)r2);
+ return 1;
+ } else if (r1 == 0 || r2 == 0) {
+ break;
+ }
+#define psnr_and_ssim(ssim, psnr, buf0, buf1, w, h) \
+ ssim = vp8_ssim2(buf0, buf1, w, w, w, h); \
+ psnr = calc_plane_error(buf0, w, buf1, w, w, h);
+ psnr_and_ssim(ssimy, psnry, buf[0], buf[1], w, h);
+ psnr_and_ssim(ssimu, psnru, buf[0] + w * h, buf[1] + w * h, w / 2, h / 2);
+ psnr_and_ssim(ssimv, psnrv, buf[0] + w * h * 5 / 4, buf[1] + w * h * 5 / 4,
+ w / 2, h / 2);
+ ssim += 0.8 * ssimy + 0.1 * (ssimu + ssimv);
+ psnravg +=
+ vp9_mse2psnr(w * h * 6 / 4, 255.0, (double)psnry + psnru + psnrv);
+ psnrglb += psnry + psnru + psnrv;
+ n_frames++;
+ }
+ free(buf[0]);
+ free(buf[1]);
+ ssim /= n_frames;
+ psnravg /= n_frames;
+ psnrglb = vp9_mse2psnr((double)n_frames * w * h * 6 / 4, 255.0, psnrglb);
+
+ printf("AvgPSNR: %lf\n", psnravg);
+ printf("GlbPSNR: %lf\n", psnrglb);
+ printf("SSIM: %lf\n", 100 * pow(ssim, 8.0));
+ printf("Nframes: %d\n", n_frames);
+
+ if (strcmp(argv[1], "-")) fclose(f[0]);
+ if (strcmp(argv[2], "-")) fclose(f[1]);
+
+ return 0;
+}
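
The constants in similarity() are the textbook SSIM stabilizers C1 = (0.01*255)^2 and C2 = (0.03*255)^2 pre-scaled by 64^2, since the function works on raw 8x8 pixel sums rather than means; each factor of the usual SSIM expression is multiplied through by count^2 so that only integer sums are needed. Worked out:

  cc1 = 64^2 * (0.01 * 255)^2 = 4096 * 6.5025  ~= 26634
  cc2 = 64^2 * (0.03 * 255)^2 = 4096 * 58.5225 ~= 239708
  c1  = (cc1 * count * count) >> 12 = (0.01 * 255)^2 * count^2   (count = 64 per 8x8 window)

A typical invocation, with hypothetical file names, follows the usage string printed by the tool:

  tiny_ssim original.yuv decoded.yuv 352x288
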
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/loopfilter_filters.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/loopfilter_filters.c
index 1f60721e1cd..2a7cde8788f 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/loopfilter_filters.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/loopfilter_filters.c
@@ -63,8 +63,8 @@ static void vp8_filter(signed char mask, uc hev, uc *op1, uc *op0, uc *oq0,
filter_value &= mask;
/* save bottom 3 bits so that we round one side +4 and the other +3
- * if it equals 4 we'll set to adjust by -1 to account for the fact
- * we'd round 3 the other way
+ * if it equals 4 we'll set it to adjust by -1 to account for the fact
+ * we'd round it by 3 the other way
*/
Filter1 = vp8_signed_char_clamp(filter_value + 4);
Filter2 = vp8_signed_char_clamp(filter_value + 3);
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/mips/msa/idct_msa.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/mips/msa/idct_msa.c
index e1759c875e4..3d516d0f81a 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/mips/msa/idct_msa.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/mips/msa/idct_msa.c
@@ -90,8 +90,7 @@ static void idct4x4_addblk_msa(int16_t *input, uint8_t *pred,
v4i32 in0, in1, in2, in3, hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3;
v4i32 res0, res1, res2, res3;
v16i8 zero = { 0 };
- v16i8 pred0, pred1, pred2, pred3, dest0, dest1, dest2, dest3;
- v16i8 mask = { 0, 4, 8, 12, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 };
+ v16i8 pred0, pred1, pred2, pred3;
LD_SH2(input, 8, input0, input1);
UNPCK_SH_SW(input0, in0, in1);
@@ -111,20 +110,17 @@ static void idct4x4_addblk_msa(int16_t *input, uint8_t *pred,
res1 = CLIP_SW_0_255(res1);
res2 = CLIP_SW_0_255(res2);
res3 = CLIP_SW_0_255(res3);
- LD_SB4(dest, dest_stride, dest0, dest1, dest2, dest3);
- VSHF_B2_SB(res0, dest0, res1, dest1, mask, mask, dest0, dest1);
- VSHF_B2_SB(res2, dest2, res3, dest3, mask, mask, dest2, dest3);
- ST_SB4(dest0, dest1, dest2, dest3, dest, dest_stride);
+ PCKEV_B2_SW(res0, res1, res2, res3, vt0, vt1);
+ res0 = (v4i32)__msa_pckev_b((v16i8)vt0, (v16i8)vt1);
+ ST4x4_UB(res0, res0, 3, 2, 1, 0, dest, dest_stride);
}
static void idct4x4_addconst_msa(int16_t in_dc, uint8_t *pred,
int32_t pred_stride, uint8_t *dest,
int32_t dest_stride) {
- v8i16 vec;
- v8i16 res0, res1, res2, res3;
+ v8i16 vec, res0, res1, res2, res3, dst0, dst1;
v16i8 zero = { 0 };
- v16i8 pred0, pred1, pred2, pred3, dest0, dest1, dest2, dest3;
- v16i8 mask = { 0, 2, 4, 6, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 };
+ v16i8 pred0, pred1, pred2, pred3;
vec = __msa_fill_h(in_dc);
vec = __msa_srari_h(vec, 3);
@@ -133,55 +129,59 @@ static void idct4x4_addconst_msa(int16_t in_dc, uint8_t *pred,
res2, res3);
ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3);
CLIP_SH4_0_255(res0, res1, res2, res3);
- LD_SB4(dest, dest_stride, dest0, dest1, dest2, dest3);
- VSHF_B2_SB(res0, dest0, res1, dest1, mask, mask, dest0, dest1);
- VSHF_B2_SB(res2, dest2, res3, dest3, mask, mask, dest2, dest3);
- ST_SB4(dest0, dest1, dest2, dest3, dest, dest_stride);
+ PCKEV_B2_SH(res1, res0, res3, res2, dst0, dst1);
+ dst0 = (v8i16)__msa_pckev_w((v4i32)dst1, (v4i32)dst0);
+ ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dest, dest_stride);
}
void vp8_short_inv_walsh4x4_msa(int16_t *input, int16_t *mb_dq_coeff) {
- v8i16 input0, input1;
- v4i32 in0, in1, in2, in3, a1, b1, c1, d1;
- v4i32 hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3;
+ v8i16 input0, input1, tmp0, tmp1, tmp2, tmp3, out0, out1;
+ const v8i16 mask0 = { 0, 1, 2, 3, 8, 9, 10, 11 };
+ const v8i16 mask1 = { 4, 5, 6, 7, 12, 13, 14, 15 };
+ const v8i16 mask2 = { 0, 4, 8, 12, 1, 5, 9, 13 };
+ const v8i16 mask3 = { 3, 7, 11, 15, 2, 6, 10, 14 };
LD_SH2(input, 8, input0, input1);
- UNPCK_SH_SW(input0, in0, in1);
- UNPCK_SH_SW(input1, in2, in3);
- BUTTERFLY_4(in0, in1, in2, in3, a1, b1, c1, d1);
- BUTTERFLY_4(a1, d1, c1, b1, hz0, hz1, hz3, hz2);
- TRANSPOSE4x4_SW_SW(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3);
- BUTTERFLY_4(hz0, hz1, hz2, hz3, a1, b1, c1, d1);
- BUTTERFLY_4(a1, d1, c1, b1, vt0, vt1, vt3, vt2);
- ADD4(vt0, 3, vt1, 3, vt2, 3, vt3, 3, vt0, vt1, vt2, vt3);
- SRA_4V(vt0, vt1, vt2, vt3, 3);
- mb_dq_coeff[0] = __msa_copy_s_h((v8i16)vt0, 0);
- mb_dq_coeff[16] = __msa_copy_s_h((v8i16)vt1, 0);
- mb_dq_coeff[32] = __msa_copy_s_h((v8i16)vt2, 0);
- mb_dq_coeff[48] = __msa_copy_s_h((v8i16)vt3, 0);
- mb_dq_coeff[64] = __msa_copy_s_h((v8i16)vt0, 2);
- mb_dq_coeff[80] = __msa_copy_s_h((v8i16)vt1, 2);
- mb_dq_coeff[96] = __msa_copy_s_h((v8i16)vt2, 2);
- mb_dq_coeff[112] = __msa_copy_s_h((v8i16)vt3, 2);
- mb_dq_coeff[128] = __msa_copy_s_h((v8i16)vt0, 4);
- mb_dq_coeff[144] = __msa_copy_s_h((v8i16)vt1, 4);
- mb_dq_coeff[160] = __msa_copy_s_h((v8i16)vt2, 4);
- mb_dq_coeff[176] = __msa_copy_s_h((v8i16)vt3, 4);
- mb_dq_coeff[192] = __msa_copy_s_h((v8i16)vt0, 6);
- mb_dq_coeff[208] = __msa_copy_s_h((v8i16)vt1, 6);
- mb_dq_coeff[224] = __msa_copy_s_h((v8i16)vt2, 6);
- mb_dq_coeff[240] = __msa_copy_s_h((v8i16)vt3, 6);
+ input1 = (v8i16)__msa_sldi_b((v16i8)input1, (v16i8)input1, 8);
+ tmp0 = input0 + input1;
+ tmp1 = input0 - input1;
+ VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
+ out0 = tmp2 + tmp3;
+ out1 = tmp2 - tmp3;
+ VSHF_H2_SH(out0, out1, out0, out1, mask2, mask3, input0, input1);
+ tmp0 = input0 + input1;
+ tmp1 = input0 - input1;
+ VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
+ tmp0 = tmp2 + tmp3;
+ tmp1 = tmp2 - tmp3;
+ ADD2(tmp0, 3, tmp1, 3, out0, out1);
+ out0 >>= 3;
+ out1 >>= 3;
+ mb_dq_coeff[0] = __msa_copy_s_h(out0, 0);
+ mb_dq_coeff[16] = __msa_copy_s_h(out0, 4);
+ mb_dq_coeff[32] = __msa_copy_s_h(out1, 0);
+ mb_dq_coeff[48] = __msa_copy_s_h(out1, 4);
+ mb_dq_coeff[64] = __msa_copy_s_h(out0, 1);
+ mb_dq_coeff[80] = __msa_copy_s_h(out0, 5);
+ mb_dq_coeff[96] = __msa_copy_s_h(out1, 1);
+ mb_dq_coeff[112] = __msa_copy_s_h(out1, 5);
+ mb_dq_coeff[128] = __msa_copy_s_h(out0, 2);
+ mb_dq_coeff[144] = __msa_copy_s_h(out0, 6);
+ mb_dq_coeff[160] = __msa_copy_s_h(out1, 2);
+ mb_dq_coeff[176] = __msa_copy_s_h(out1, 6);
+ mb_dq_coeff[192] = __msa_copy_s_h(out0, 3);
+ mb_dq_coeff[208] = __msa_copy_s_h(out0, 7);
+ mb_dq_coeff[224] = __msa_copy_s_h(out1, 3);
+ mb_dq_coeff[240] = __msa_copy_s_h(out1, 7);
}
static void dequant_idct4x4_addblk_msa(int16_t *input, int16_t *dequant_input,
uint8_t *dest, int32_t dest_stride) {
v8i16 input0, input1, dequant_in0, dequant_in1, mul0, mul1;
- v8i16 in0, in1, in2, in3;
- v8i16 hz0_h, hz1_h, hz2_h, hz3_h;
- v16i8 dest0, dest1, dest2, dest3;
- v4i32 hz0_w, hz1_w, hz2_w, hz3_w;
- v4i32 vt0, vt1, vt2, vt3, res0, res1, res2, res3;
+ v8i16 in0, in1, in2, in3, hz0_h, hz1_h, hz2_h, hz3_h;
+ v16u8 dest0, dest1, dest2, dest3;
+ v4i32 hz0_w, hz1_w, hz2_w, hz3_w, vt0, vt1, vt2, vt3, res0, res1, res2, res3;
v2i64 zero = { 0 };
- v16i8 mask = { 0, 4, 8, 12, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 };
LD_SH2(input, 8, input0, input1);
LD_SH2(dequant_input, 8, dequant_in0, dequant_in1);
@@ -196,7 +196,7 @@ static void dequant_idct4x4_addblk_msa(int16_t *input, int16_t *dequant_input,
VP8_IDCT_1D_W(hz0_w, hz1_w, hz2_w, hz3_w, vt0, vt1, vt2, vt3);
SRARI_W4_SW(vt0, vt1, vt2, vt3, 3);
TRANSPOSE4x4_SW_SW(vt0, vt1, vt2, vt3, vt0, vt1, vt2, vt3);
- LD_SB4(dest, dest_stride, dest0, dest1, dest2, dest3);
+ LD_UB4(dest, dest_stride, dest0, dest1, dest2, dest3);
ILVR_B4_SW(zero, dest0, zero, dest1, zero, dest2, zero, dest3, res0, res1,
res2, res3);
ILVR_H4_SW(zero, res0, zero, res1, zero, res2, zero, res3, res0, res1, res2,
@@ -206,19 +206,17 @@ static void dequant_idct4x4_addblk_msa(int16_t *input, int16_t *dequant_input,
res1 = CLIP_SW_0_255(res1);
res2 = CLIP_SW_0_255(res2);
res3 = CLIP_SW_0_255(res3);
- VSHF_B2_SB(res0, dest0, res1, dest1, mask, mask, dest0, dest1);
- VSHF_B2_SB(res2, dest2, res3, dest3, mask, mask, dest2, dest3);
- ST_SB4(dest0, dest1, dest2, dest3, dest, dest_stride);
+ PCKEV_B2_SW(res0, res1, res2, res3, vt0, vt1);
+ res0 = (v4i32)__msa_pckev_b((v16i8)vt0, (v16i8)vt1);
+ ST4x4_UB(res0, res0, 3, 2, 1, 0, dest, dest_stride);
}
static void dequant_idct4x4_addblk_2x_msa(int16_t *input,
int16_t *dequant_input, uint8_t *dest,
int32_t dest_stride) {
v16u8 dest0, dest1, dest2, dest3;
- v8i16 in0, in1, in2, in3;
- v8i16 mul0, mul1, mul2, mul3, dequant_in0, dequant_in1;
- v8i16 hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3;
- v8i16 res0, res1, res2, res3;
+ v8i16 in0, in1, in2, in3, mul0, mul1, mul2, mul3, dequant_in0, dequant_in1;
+ v8i16 hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3, res0, res1, res2, res3;
v4i32 hz0l, hz1l, hz2l, hz3l, hz0r, hz1r, hz2r, hz3r;
v4i32 vt0l, vt1l, vt2l, vt3l, vt0r, vt1r, vt2r, vt3r;
v16i8 zero = { 0 };
@@ -247,11 +245,8 @@ static void dequant_idct4x4_addblk_2x_msa(int16_t *input,
res2, res3);
ADD4(res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0, res1, res2, res3);
CLIP_SH4_0_255(res0, res1, res2, res3);
- PCKEV_B4_SH(res0, res0, res1, res1, res2, res2, res3, res3, res0, res1, res2,
- res3);
- PCKOD_D2_UB(dest0, res0, dest1, res1, dest0, dest1);
- PCKOD_D2_UB(dest2, res2, dest3, res3, dest2, dest3);
- ST_UB4(dest0, dest1, dest2, dest3, dest, dest_stride);
+ PCKEV_B2_SW(res1, res0, res3, res2, vt0l, vt1l);
+ ST8x4_UB(vt0l, vt1l, dest, dest_stride);
__asm__ __volatile__(
"sw $zero, 0(%[input]) \n\t"
@@ -276,10 +271,9 @@ static void dequant_idct4x4_addblk_2x_msa(int16_t *input,
static void dequant_idct_addconst_2x_msa(int16_t *input, int16_t *dequant_input,
uint8_t *dest, int32_t dest_stride) {
- v8i16 input_dc0, input_dc1, vec;
+ v8i16 input_dc0, input_dc1, vec, res0, res1, res2, res3;
v16u8 dest0, dest1, dest2, dest3;
v16i8 zero = { 0 };
- v8i16 res0, res1, res2, res3;
input_dc0 = __msa_fill_h(input[0] * dequant_input[0]);
input_dc1 = __msa_fill_h(input[16] * dequant_input[0]);
@@ -292,11 +286,8 @@ static void dequant_idct_addconst_2x_msa(int16_t *input, int16_t *dequant_input,
res2, res3);
ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3);
CLIP_SH4_0_255(res0, res1, res2, res3);
- PCKEV_B4_SH(res0, res0, res1, res1, res2, res2, res3, res3, res0, res1, res2,
- res3);
- PCKOD_D2_UB(dest0, res0, dest1, res1, dest0, dest1);
- PCKOD_D2_UB(dest2, res2, dest3, res3, dest2, dest3);
- ST_UB4(dest0, dest1, dest2, dest3, dest, dest_stride);
+ PCKEV_B2_SH(res1, res0, res3, res2, res0, res1);
+ ST8x4_UB(res0, res1, dest, dest_stride);
}
void vp8_short_idct4x4llm_msa(int16_t *input, uint8_t *pred_ptr,
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/mips/msa/loopfilter_filters_msa.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/mips/msa/loopfilter_filters_msa.c
index f5f1790ef53..98a4fc09a35 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/mips/msa/loopfilter_filters_msa.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/mips/msa/loopfilter_filters_msa.c
@@ -24,208 +24,145 @@
mask = ((v16u8)mask <= b_limit); \
}
-#define VP8_LPF_FILTER4_4W(p1_in_out, p0_in_out, q0_in_out, q1_in_out, \
- mask_in, hev_in) \
- { \
- v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign; \
- v16i8 filt, filt1, filt2, cnst4b, cnst3b; \
- v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h; \
- \
- p1_m = (v16i8)__msa_xori_b(p1_in_out, 0x80); \
- p0_m = (v16i8)__msa_xori_b(p0_in_out, 0x80); \
- q0_m = (v16i8)__msa_xori_b(q0_in_out, 0x80); \
- q1_m = (v16i8)__msa_xori_b(q1_in_out, 0x80); \
- \
- filt = __msa_subs_s_b(p1_m, q1_m); \
- \
- filt = filt & (v16i8)hev_in; \
- \
- q0_sub_p0 = q0_m - p0_m; \
- filt_sign = __msa_clti_s_b(filt, 0); \
- \
- cnst3h = __msa_ldi_h(3); \
- q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0, q0_sub_p0); \
- q0_sub_p0_r = __msa_dotp_s_h((v16i8)q0_sub_p0_r, (v16i8)cnst3h); \
- filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt); \
- filt_r += q0_sub_p0_r; \
- filt_r = __msa_sat_s_h(filt_r, 7); \
- \
- q0_sub_p0_l = (v8i16)__msa_ilvl_b(q0_sub_p0, q0_sub_p0); \
- q0_sub_p0_l = __msa_dotp_s_h((v16i8)q0_sub_p0_l, (v16i8)cnst3h); \
- filt_l = (v8i16)__msa_ilvl_b(filt_sign, filt); \
- filt_l += q0_sub_p0_l; \
- filt_l = __msa_sat_s_h(filt_l, 7); \
- \
- filt = __msa_pckev_b((v16i8)filt_l, (v16i8)filt_r); \
- filt = filt & (v16i8)mask_in; \
- \
- cnst4b = __msa_ldi_b(4); \
- filt1 = __msa_adds_s_b(filt, cnst4b); \
- filt1 >>= 3; \
- \
- cnst3b = __msa_ldi_b(3); \
- filt2 = __msa_adds_s_b(filt, cnst3b); \
- filt2 >>= 3; \
- \
- q0_m = __msa_subs_s_b(q0_m, filt1); \
- q0_in_out = __msa_xori_b((v16u8)q0_m, 0x80); \
- p0_m = __msa_adds_s_b(p0_m, filt2); \
- p0_in_out = __msa_xori_b((v16u8)p0_m, 0x80); \
- \
- filt = __msa_srari_b(filt1, 1); \
- hev_in = __msa_xori_b((v16u8)hev_in, 0xff); \
- filt = filt & (v16i8)hev_in; \
- \
- q1_m = __msa_subs_s_b(q1_m, filt); \
- q1_in_out = __msa_xori_b((v16u8)q1_m, 0x80); \
- p1_m = __msa_adds_s_b(p1_m, filt); \
- p1_in_out = __msa_xori_b((v16u8)p1_m, 0x80); \
+#define VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev) \
+ { \
+ v16i8 p1_m, p0_m, q0_m, q1_m, filt, q0_sub_p0, t1, t2; \
+ const v16i8 cnst4b = __msa_ldi_b(4); \
+ const v16i8 cnst3b = __msa_ldi_b(3); \
+ \
+ p1_m = (v16i8)__msa_xori_b(p1, 0x80); \
+ p0_m = (v16i8)__msa_xori_b(p0, 0x80); \
+ q0_m = (v16i8)__msa_xori_b(q0, 0x80); \
+ q1_m = (v16i8)__msa_xori_b(q1, 0x80); \
+ \
+ filt = __msa_subs_s_b(p1_m, q1_m); \
+ filt &= hev; \
+ q0_sub_p0 = __msa_subs_s_b(q0_m, p0_m); \
+ filt = __msa_adds_s_b(filt, q0_sub_p0); \
+ filt = __msa_adds_s_b(filt, q0_sub_p0); \
+ filt = __msa_adds_s_b(filt, q0_sub_p0); \
+ filt &= mask; \
+ t1 = __msa_adds_s_b(filt, cnst4b); \
+ t1 >>= cnst3b; \
+ t2 = __msa_adds_s_b(filt, cnst3b); \
+ t2 >>= cnst3b; \
+ q0_m = __msa_subs_s_b(q0_m, t1); \
+ q0 = __msa_xori_b((v16u8)q0_m, 0x80); \
+ p0_m = __msa_adds_s_b(p0_m, t2); \
+ p0 = __msa_xori_b((v16u8)p0_m, 0x80); \
+ filt = __msa_srari_b(t1, 1); \
+ hev = __msa_xori_b(hev, 0xff); \
+ filt &= hev; \
+ q1_m = __msa_subs_s_b(q1_m, filt); \
+ q1 = __msa_xori_b((v16u8)q1_m, 0x80); \
+ p1_m = __msa_adds_s_b(p1_m, filt); \
+ p1 = __msa_xori_b((v16u8)p1_m, 0x80); \
}
-#define VP8_SIMPLE_FILT(p1_in, p0_in, q0_in, q1_in, mask) \
- { \
- v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, q0_sub_p0_sign; \
- v16i8 filt, filt1, filt2, cnst4b, cnst3b, filt_sign; \
- v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h; \
- \
- p1_m = (v16i8)__msa_xori_b(p1_in, 0x80); \
- p0_m = (v16i8)__msa_xori_b(p0_in, 0x80); \
- q0_m = (v16i8)__msa_xori_b(q0_in, 0x80); \
- q1_m = (v16i8)__msa_xori_b(q1_in, 0x80); \
- \
- filt = __msa_subs_s_b(p1_m, q1_m); \
- \
- q0_sub_p0 = q0_m - p0_m; \
- filt_sign = __msa_clti_s_b(filt, 0); \
- \
- cnst3h = __msa_ldi_h(3); \
- q0_sub_p0_sign = __msa_clti_s_b(q0_sub_p0, 0); \
- q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0_sign, q0_sub_p0); \
- q0_sub_p0_r *= cnst3h; \
- filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt); \
- filt_r += q0_sub_p0_r; \
- filt_r = __msa_sat_s_h(filt_r, 7); \
- \
- q0_sub_p0_l = (v8i16)__msa_ilvl_b(q0_sub_p0_sign, q0_sub_p0); \
- q0_sub_p0_l *= cnst3h; \
- filt_l = (v8i16)__msa_ilvl_b(filt_sign, filt); \
- filt_l += q0_sub_p0_l; \
- filt_l = __msa_sat_s_h(filt_l, 7); \
- \
- filt = __msa_pckev_b((v16i8)filt_l, (v16i8)filt_r); \
- filt = filt & (v16i8)(mask); \
- \
- cnst4b = __msa_ldi_b(4); \
- filt1 = __msa_adds_s_b(filt, cnst4b); \
- filt1 >>= 3; \
- \
- cnst3b = __msa_ldi_b(3); \
- filt2 = __msa_adds_s_b(filt, cnst3b); \
- filt2 >>= 3; \
- \
- q0_m = __msa_subs_s_b(q0_m, filt1); \
- p0_m = __msa_adds_s_b(p0_m, filt2); \
- q0_in = __msa_xori_b((v16u8)q0_m, 0x80); \
- p0_in = __msa_xori_b((v16u8)p0_m, 0x80); \
+#define VP8_SIMPLE_FILT(p1_in, p0_in, q0_in, q1_in, mask) \
+ { \
+ v16i8 p1_m, p0_m, q0_m, q1_m, filt, filt1, filt2; \
+ v16i8 q0_sub_p0; \
+ const v16i8 cnst4b = __msa_ldi_b(4); \
+ const v16i8 cnst3b = __msa_ldi_b(3); \
+ \
+ p1_m = (v16i8)__msa_xori_b(p1_in, 0x80); \
+ p0_m = (v16i8)__msa_xori_b(p0_in, 0x80); \
+ q0_m = (v16i8)__msa_xori_b(q0_in, 0x80); \
+ q1_m = (v16i8)__msa_xori_b(q1_in, 0x80); \
+ \
+ filt = __msa_subs_s_b(p1_m, q1_m); \
+ q0_sub_p0 = __msa_subs_s_b(q0_m, p0_m); \
+ filt = __msa_adds_s_b(filt, q0_sub_p0); \
+ filt = __msa_adds_s_b(filt, q0_sub_p0); \
+ filt = __msa_adds_s_b(filt, q0_sub_p0); \
+ filt &= mask; \
+ filt1 = __msa_adds_s_b(filt, cnst4b); \
+ filt1 >>= cnst3b; \
+ filt2 = __msa_adds_s_b(filt, cnst3b); \
+ filt2 >>= cnst3b; \
+ q0_m = __msa_subs_s_b(q0_m, filt1); \
+ p0_m = __msa_adds_s_b(p0_m, filt2); \
+ q0_in = __msa_xori_b((v16u8)q0_m, 0x80); \
+ p0_in = __msa_xori_b((v16u8)p0_m, 0x80); \
}
-#define VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev) \
- { \
- v16i8 p2_m, p1_m, p0_m, q2_m, q1_m, q0_m; \
- v16i8 filt, q0_sub_p0, cnst4b, cnst3b; \
- v16i8 u, filt1, filt2, filt_sign, q0_sub_p0_sign; \
- v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_r, u_r, u_l, filt_l; \
- v8i16 cnst3h, cnst27h, cnst18h, cnst63h; \
- \
- cnst3h = __msa_ldi_h(3); \
- \
- p2_m = (v16i8)__msa_xori_b(p2, 0x80); \
- p1_m = (v16i8)__msa_xori_b(p1, 0x80); \
- p0_m = (v16i8)__msa_xori_b(p0, 0x80); \
- q0_m = (v16i8)__msa_xori_b(q0, 0x80); \
- q1_m = (v16i8)__msa_xori_b(q1, 0x80); \
- q2_m = (v16i8)__msa_xori_b(q2, 0x80); \
- \
- filt = __msa_subs_s_b(p1_m, q1_m); \
- q0_sub_p0 = q0_m - p0_m; \
- q0_sub_p0_sign = __msa_clti_s_b(q0_sub_p0, 0); \
- filt_sign = __msa_clti_s_b(filt, 0); \
- \
- q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0_sign, q0_sub_p0); \
- q0_sub_p0_r *= cnst3h; \
- filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt); \
- filt_r = filt_r + q0_sub_p0_r; \
- filt_r = __msa_sat_s_h(filt_r, 7); \
- \
- q0_sub_p0_l = (v8i16)__msa_ilvl_b(q0_sub_p0_sign, q0_sub_p0); \
- q0_sub_p0_l *= cnst3h; \
- filt_l = (v8i16)__msa_ilvl_b(filt_sign, filt); \
- filt_l = filt_l + q0_sub_p0_l; \
- filt_l = __msa_sat_s_h(filt_l, 7); \
- \
- filt = __msa_pckev_b((v16i8)filt_l, (v16i8)filt_r); \
- filt = filt & (v16i8)mask; \
- filt2 = filt & (v16i8)hev; \
- \
- hev = __msa_xori_b(hev, 0xff); \
- filt = filt & (v16i8)hev; \
- cnst4b = __msa_ldi_b(4); \
- filt1 = __msa_adds_s_b(filt2, cnst4b); \
- filt1 >>= 3; \
- cnst3b = __msa_ldi_b(3); \
- filt2 = __msa_adds_s_b(filt2, cnst3b); \
- filt2 >>= 3; \
- q0_m = __msa_subs_s_b(q0_m, filt1); \
- p0_m = __msa_adds_s_b(p0_m, filt2); \
- \
- filt_sign = __msa_clti_s_b(filt, 0); \
- ILVRL_B2_SH(filt_sign, filt, filt_r, filt_l); \
- \
- cnst27h = __msa_ldi_h(27); \
- cnst63h = __msa_ldi_h(63); \
- \
- u_r = filt_r * cnst27h; \
- u_r += cnst63h; \
- u_r >>= 7; \
- u_r = __msa_sat_s_h(u_r, 7); \
- u_l = filt_l * cnst27h; \
- u_l += cnst63h; \
- u_l >>= 7; \
- u_l = __msa_sat_s_h(u_l, 7); \
- u = __msa_pckev_b((v16i8)u_l, (v16i8)u_r); \
- q0_m = __msa_subs_s_b(q0_m, u); \
- q0 = __msa_xori_b((v16u8)q0_m, 0x80); \
- p0_m = __msa_adds_s_b(p0_m, u); \
- p0 = __msa_xori_b((v16u8)p0_m, 0x80); \
- cnst18h = __msa_ldi_h(18); \
- u_r = filt_r * cnst18h; \
- u_r += cnst63h; \
- u_r >>= 7; \
- u_r = __msa_sat_s_h(u_r, 7); \
- \
- u_l = filt_l * cnst18h; \
- u_l += cnst63h; \
- u_l >>= 7; \
- u_l = __msa_sat_s_h(u_l, 7); \
- u = __msa_pckev_b((v16i8)u_l, (v16i8)u_r); \
- q1_m = __msa_subs_s_b(q1_m, u); \
- q1 = __msa_xori_b((v16u8)q1_m, 0x80); \
- p1_m = __msa_adds_s_b(p1_m, u); \
- p1 = __msa_xori_b((v16u8)p1_m, 0x80); \
- u_r = filt_r << 3; \
- u_r += filt_r + cnst63h; \
- u_r >>= 7; \
- u_r = __msa_sat_s_h(u_r, 7); \
- \
- u_l = filt_l << 3; \
- u_l += filt_l + cnst63h; \
- u_l >>= 7; \
- u_l = __msa_sat_s_h(u_l, 7); \
- u = __msa_pckev_b((v16i8)u_l, (v16i8)u_r); \
- q2_m = __msa_subs_s_b(q2_m, u); \
- q2 = __msa_xori_b((v16u8)q2_m, 0x80); \
- p2_m = __msa_adds_s_b(p2_m, u); \
- p2 = __msa_xori_b((v16u8)p2_m, 0x80); \
+#define VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev) \
+ { \
+ v16i8 p2_m, p1_m, p0_m, q2_m, q1_m, q0_m; \
+ v16i8 u, filt, t1, t2, filt_sign, q0_sub_p0; \
+ v8i16 filt_r, filt_l, u_r, u_l; \
+ v8i16 temp0, temp1, temp2, temp3; \
+ const v16i8 cnst4b = __msa_ldi_b(4); \
+ const v16i8 cnst3b = __msa_ldi_b(3); \
+ const v8i16 cnst9h = __msa_ldi_h(9); \
+ const v8i16 cnst63h = __msa_ldi_h(63); \
+ \
+ p2_m = (v16i8)__msa_xori_b(p2, 0x80); \
+ p1_m = (v16i8)__msa_xori_b(p1, 0x80); \
+ p0_m = (v16i8)__msa_xori_b(p0, 0x80); \
+ q0_m = (v16i8)__msa_xori_b(q0, 0x80); \
+ q1_m = (v16i8)__msa_xori_b(q1, 0x80); \
+ q2_m = (v16i8)__msa_xori_b(q2, 0x80); \
+ \
+ filt = __msa_subs_s_b(p1_m, q1_m); \
+ q0_sub_p0 = __msa_subs_s_b(q0_m, p0_m); \
+ filt = __msa_adds_s_b(filt, q0_sub_p0); \
+ filt = __msa_adds_s_b(filt, q0_sub_p0); \
+ filt = __msa_adds_s_b(filt, q0_sub_p0); \
+ filt &= mask; \
+ \
+ t2 = filt & hev; \
+ hev = __msa_xori_b(hev, 0xff); \
+ filt &= hev; \
+ t1 = __msa_adds_s_b(t2, cnst4b); \
+ t1 >>= cnst3b; \
+ t2 = __msa_adds_s_b(t2, cnst3b); \
+ t2 >>= cnst3b; \
+ q0_m = __msa_subs_s_b(q0_m, t1); \
+ p0_m = __msa_adds_s_b(p0_m, t2); \
+ filt_sign = __msa_clti_s_b(filt, 0); \
+ ILVRL_B2_SH(filt_sign, filt, filt_r, filt_l); \
+ temp0 = filt_r * cnst9h; \
+ temp1 = temp0 + cnst63h; \
+ temp2 = filt_l * cnst9h; \
+ temp3 = temp2 + cnst63h; \
+ \
+ u_r = temp1 >> 7; \
+ u_r = __msa_sat_s_h(u_r, 7); \
+ u_l = temp3 >> 7; \
+ u_l = __msa_sat_s_h(u_l, 7); \
+ u = __msa_pckev_b((v16i8)u_l, (v16i8)u_r); \
+ q2_m = __msa_subs_s_b(q2_m, u); \
+ p2_m = __msa_adds_s_b(p2_m, u); \
+ q2 = __msa_xori_b((v16u8)q2_m, 0x80); \
+ p2 = __msa_xori_b((v16u8)p2_m, 0x80); \
+ \
+ temp1 += temp0; \
+ temp3 += temp2; \
+ \
+ u_r = temp1 >> 7; \
+ u_r = __msa_sat_s_h(u_r, 7); \
+ u_l = temp3 >> 7; \
+ u_l = __msa_sat_s_h(u_l, 7); \
+ u = __msa_pckev_b((v16i8)u_l, (v16i8)u_r); \
+ q1_m = __msa_subs_s_b(q1_m, u); \
+ p1_m = __msa_adds_s_b(p1_m, u); \
+ q1 = __msa_xori_b((v16u8)q1_m, 0x80); \
+ p1 = __msa_xori_b((v16u8)p1_m, 0x80); \
+ \
+ temp1 += temp0; \
+ temp3 += temp2; \
+ \
+ u_r = temp1 >> 7; \
+ u_r = __msa_sat_s_h(u_r, 7); \
+ u_l = temp3 >> 7; \
+ u_l = __msa_sat_s_h(u_l, 7); \
+ u = __msa_pckev_b((v16i8)u_l, (v16i8)u_r); \
+ q0_m = __msa_subs_s_b(q0_m, u); \
+ p0_m = __msa_adds_s_b(p0_m, u); \
+ q0 = __msa_xori_b((v16u8)q0_m, 0x80); \
+ p0 = __msa_xori_b((v16u8)p0_m, 0x80); \
}
#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \
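[Note: the rewritten VP8_SIMPLE_FILT above is the MSA-vectorized form of VP8's simple loop-filter adjustment. For readers not fluent in MSA intrinsics, a one-pixel scalar sketch of the same math follows; the function and helper names are illustrative, not part of libvpx, and an arithmetic right shift of signed values is assumed.]

#include <stdint.h>

/* Saturate to the int8_t range, mirroring the __msa_*_s_b saturating ops. */
static int8_t clamp8(int v) { return (int8_t)(v < -128 ? -128 : (v > 127 ? 127 : v)); }

/* One-pixel scalar equivalent of VP8_SIMPLE_FILT: adjust p0/q0 when mask is set. */
static void simple_filter_scalar(uint8_t *p1, uint8_t *p0, uint8_t *q0,
                                 uint8_t *q1, int mask) {
  /* XOR with 0x80 converts unsigned pixels to signed, zero-centered values. */
  const int8_t ps1 = (int8_t)(*p1 ^ 0x80), ps0 = (int8_t)(*p0 ^ 0x80);
  const int8_t qs0 = (int8_t)(*q0 ^ 0x80), qs1 = (int8_t)(*q1 ^ 0x80);
  const int8_t q0_sub_p0 = clamp8(qs0 - ps0);
  int8_t filt = clamp8(ps1 - qs1);
  int8_t filt1, filt2;
  /* Three saturating adds implement filt += 3 * (q0 - p0), as in the macro. */
  filt = clamp8(filt + q0_sub_p0);
  filt = clamp8(filt + q0_sub_p0);
  filt = clamp8(filt + q0_sub_p0);
  if (!mask) filt = 0;            /* the vector code ANDs filt with the mask */
  filt1 = clamp8(filt + 4) >> 3;  /* correction applied to q0 */
  filt2 = clamp8(filt + 3) >> 3;  /* correction applied to p0 */
  *q0 = (uint8_t)(clamp8(qs0 - filt1) ^ 0x80);
  *p0 = (uint8_t)(clamp8(ps0 + filt2) ^ 0x80);
}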
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/mips/msa/vp8_macros_msa.h b/chromium/third_party/libvpx/source/libvpx/vp8/common/mips/msa/vp8_macros_msa.h
index 65905f6c027..6bec3adec39 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/mips/msa/vp8_macros_msa.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/mips/msa/vp8_macros_msa.h
@@ -1221,6 +1221,8 @@
}
#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
+#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)
+#define PCKEV_B2_SW(...) PCKEV_B2(v4i32, __VA_ARGS__)
#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
out2, out3) \
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/onyx.h b/chromium/third_party/libvpx/source/libvpx/vp8/common/onyx.h
index 43e3c29b509..72fba2ec56b 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/onyx.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/onyx.h
@@ -110,6 +110,8 @@ typedef struct {
int Sharpness;
int cpu_used;
unsigned int rc_max_intra_bitrate_pct;
+ /* percent of rate boost for golden frame in CBR mode. */
+ unsigned int gf_cbr_boost_pct;
unsigned int screen_content_mode;
/* mode ->
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_if.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_if.c
index 87560f28b1e..c5389594553 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_if.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_if.c
@@ -1467,6 +1467,12 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) {
cpi->baseline_gf_interval =
cpi->oxcf.alt_freq ? cpi->oxcf.alt_freq : DEFAULT_GF_INTERVAL;
+ // GF behavior for 1 pass CBR, used when error_resilience is off.
+ if (!cpi->oxcf.error_resilient_mode &&
+ cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER &&
+ cpi->oxcf.Mode == MODE_REALTIME)
+ cpi->baseline_gf_interval = cpi->gf_interval_onepass_cbr;
+
#if (CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
cpi->oxcf.token_partitions = 3;
#endif
@@ -1766,9 +1772,13 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) {
cpi->mse_source_denoised = 0;
/* Should we use the cyclic refresh method.
- * Currently this is tied to error resilliant mode
+ * Currently there is no external control for this.
+ * Enable it for error_resilient_mode, or for 1 pass CBR mode.
*/
- cpi->cyclic_refresh_mode_enabled = cpi->oxcf.error_resilient_mode;
+ cpi->cyclic_refresh_mode_enabled =
+ (cpi->oxcf.error_resilient_mode ||
+ (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER &&
+ cpi->oxcf.Mode <= 2));
cpi->cyclic_refresh_mode_max_mbs_perframe =
(cpi->common.mb_rows * cpi->common.mb_cols) / 7;
if (cpi->oxcf.number_of_layers == 1) {
@@ -1781,6 +1791,23 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) {
cpi->cyclic_refresh_mode_index = 0;
cpi->cyclic_refresh_q = 32;
+ // GF behavior for 1 pass CBR, used when error_resilience is off.
+ cpi->gf_update_onepass_cbr = 0;
+ cpi->gf_noboost_onepass_cbr = 0;
+ if (!cpi->oxcf.error_resilient_mode &&
+ cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER && cpi->oxcf.Mode <= 2) {
+ cpi->gf_update_onepass_cbr = 1;
+ cpi->gf_noboost_onepass_cbr = 1;
+ cpi->gf_interval_onepass_cbr =
+ cpi->cyclic_refresh_mode_max_mbs_perframe > 0
+ ? (2 * (cpi->common.mb_rows * cpi->common.mb_cols) /
+ cpi->cyclic_refresh_mode_max_mbs_perframe)
+ : 10;
+ cpi->gf_interval_onepass_cbr =
+ VPXMIN(40, VPXMAX(6, cpi->gf_interval_onepass_cbr));
+ cpi->baseline_gf_interval = cpi->gf_interval_onepass_cbr;
+ }
+
if (cpi->cyclic_refresh_mode_enabled) {
CHECK_MEM_ERROR(cpi->cyclic_refresh_map,
vpx_calloc((cpi->common.mb_rows * cpi->common.mb_cols), 1));
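[Note: a worked example of the interval computed above, with hypothetical dimensions; none of these numbers come from the patch.]

/* Illustrative: a 640x480 clip has mb_rows = 30, mb_cols = 40 -> MBs = 1200.
 *   cyclic_refresh_mode_max_mbs_perframe = 1200 / 7        = 171
 *   gf_interval_onepass_cbr              = 2 * 1200 / 171  = 14
 *   clamped to the [6, 40] range                           -> 14
 * so in this configuration a golden-frame update is scheduled roughly every
 * 14 frames, i.e. about two full cyclic-refresh sweeps of the frame.
 */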
@@ -3925,7 +3952,6 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
#else
/* transform / motion compensation build reconstruction frame */
vp8_encode_frame(cpi);
-
if (cpi->oxcf.screen_content_mode == 2) {
if (vp8_drop_encodedframe_overshoot(cpi, Q)) return;
}
@@ -4203,6 +4229,20 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
}
} while (Loop == 1);
+#if defined(DROP_UNCODED_FRAMES)
+ /* if there are no coded macroblocks at all drop this frame */
+ if (cpi->common.MBs == cpi->mb.skip_true_count &&
+ (cpi->drop_frame_count & 7) != 7 && cm->frame_type != KEY_FRAME) {
+ cpi->common.current_video_frame++;
+ cpi->frames_since_key++;
+ cpi->drop_frame_count++;
+ // We advance the temporal pattern for dropped frames.
+ cpi->temporal_pattern_counter++;
+ return;
+ }
+ cpi->drop_frame_count = 0;
+#endif
+
#if 0
/* Experimental code for lagged and one pass
* Update stats used for one pass GF selection
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_int.h b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_int.h
index 59ad5773a64..bfcc6457c19 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_int.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_int.h
@@ -413,6 +413,9 @@ typedef struct VP8_COMP {
int drop_frames_allowed; /* Are we permitted to drop frames? */
int drop_frame; /* Drop this frame? */
+#if defined(DROP_UNCODED_FRAMES)
+ int drop_frame_count;
+#endif
vp8_prob frame_coef_probs[BLOCK_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS]
[ENTROPY_NODES];
@@ -501,6 +504,11 @@ typedef struct VP8_COMP {
int force_maxqp;
+ // GF update for 1 pass cbr.
+ int gf_update_onepass_cbr;
+ int gf_interval_onepass_cbr;
+ int gf_noboost_onepass_cbr;
+
#if CONFIG_MULTITHREAD
/* multithread data */
int *mt_current_mb_col;
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ratectrl.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ratectrl.c
index 4d6afc19b35..e89247ae4ae 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ratectrl.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ratectrl.c
@@ -885,61 +885,61 @@ static void calc_pframe_target_size(VP8_COMP *cpi) {
/* Adjust target frame size for Golden Frames: */
if (cpi->oxcf.error_resilient_mode == 0 &&
(cpi->frames_till_gf_update_due == 0) && !cpi->drop_frame) {
- int Q =
- (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q;
-
- int gf_frame_useage = 0; /* Golden frame useage since last GF */
- int tot_mbs = cpi->recent_ref_frame_usage[INTRA_FRAME] +
- cpi->recent_ref_frame_usage[LAST_FRAME] +
- cpi->recent_ref_frame_usage[GOLDEN_FRAME] +
- cpi->recent_ref_frame_usage[ALTREF_FRAME];
-
- int pct_gf_active = (100 * cpi->gf_active_count) /
- (cpi->common.mb_rows * cpi->common.mb_cols);
-
- if (tot_mbs) {
- gf_frame_useage = (cpi->recent_ref_frame_usage[GOLDEN_FRAME] +
- cpi->recent_ref_frame_usage[ALTREF_FRAME]) *
- 100 / tot_mbs;
- }
+ if (!cpi->gf_update_onepass_cbr) {
+ int Q = (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME]
+ : cpi->oxcf.fixed_q;
+
+ int gf_frame_useage = 0; /* Golden frame useage since last GF */
+ int tot_mbs = cpi->recent_ref_frame_usage[INTRA_FRAME] +
+ cpi->recent_ref_frame_usage[LAST_FRAME] +
+ cpi->recent_ref_frame_usage[GOLDEN_FRAME] +
+ cpi->recent_ref_frame_usage[ALTREF_FRAME];
+
+ int pct_gf_active = (100 * cpi->gf_active_count) /
+ (cpi->common.mb_rows * cpi->common.mb_cols);
+
+ if (tot_mbs) {
+ gf_frame_useage = (cpi->recent_ref_frame_usage[GOLDEN_FRAME] +
+ cpi->recent_ref_frame_usage[ALTREF_FRAME]) *
+ 100 / tot_mbs;
+ }
- if (pct_gf_active > gf_frame_useage) gf_frame_useage = pct_gf_active;
+ if (pct_gf_active > gf_frame_useage) gf_frame_useage = pct_gf_active;
- /* Is a fixed manual GF frequency being used */
- if (cpi->auto_gold) {
- /* For one pass throw a GF if recent frame intra useage is
- * low or the GF useage is high
- */
- if ((cpi->pass == 0) &&
- (cpi->this_frame_percent_intra < 15 || gf_frame_useage >= 5)) {
- cpi->common.refresh_golden_frame = 1;
+ /* Is a fixed manual GF frequency being used */
+ if (cpi->auto_gold) {
+ /* For one pass throw a GF if recent frame intra usage is
+ * low or the GF usage is high
+ */
+ if ((cpi->pass == 0) &&
+ (cpi->this_frame_percent_intra < 15 || gf_frame_useage >= 5)) {
+ cpi->common.refresh_golden_frame = 1;
- /* Two pass GF descision */
- } else if (cpi->pass == 2) {
- cpi->common.refresh_golden_frame = 1;
+ /* Two pass GF decision */
+ } else if (cpi->pass == 2) {
+ cpi->common.refresh_golden_frame = 1;
+ }
}
- }
#if 0
- /* Debug stats */
- if (0)
- {
- FILE *f;
+ /* Debug stats */
+ if (0) {
+ FILE *f;
- f = fopen("gf_useaget.stt", "a");
- fprintf(f, " %8ld %10ld %10ld %10ld %10ld\n",
- cpi->common.current_video_frame, cpi->gfu_boost, GFQ_ADJUSTMENT, cpi->gfu_boost, gf_frame_useage);
- fclose(f);
- }
+ f = fopen("gf_useaget.stt", "a");
+ fprintf(f, " %8ld %10ld %10ld %10ld %10ld\n",
+ cpi->common.current_video_frame, cpi->gfu_boost,
+ GFQ_ADJUSTMENT, cpi->gfu_boost, gf_frame_useage);
+ fclose(f);
+ }
#endif
- if (cpi->common.refresh_golden_frame == 1) {
+ if (cpi->common.refresh_golden_frame == 1) {
#if 0
- if (0)
- {
+ if (0) {
FILE *f;
f = fopen("GFexit.stt", "a");
@@ -949,61 +949,76 @@ static void calc_pframe_target_size(VP8_COMP *cpi) {
#endif
- if (cpi->auto_adjust_gold_quantizer) {
- calc_gf_params(cpi);
- }
-
- /* If we are using alternate ref instead of gf then do not apply the
- * boost It will instead be applied to the altref update Jims
- * modified boost
- */
- if (!cpi->source_alt_ref_active) {
- if (cpi->oxcf.fixed_q < 0) {
- if (cpi->pass == 2) {
- /* The spend on the GF is defined in the two pass
- * code for two pass encodes
- */
- cpi->this_frame_target = cpi->per_frame_bandwidth;
- } else {
- int Boost = cpi->last_boost;
- int frames_in_section = cpi->frames_till_gf_update_due + 1;
- int allocation_chunks = (frames_in_section * 100) + (Boost - 100);
- int bits_in_section = cpi->inter_frame_target * frames_in_section;
-
- /* Normalize Altboost and allocations chunck down to
- * prevent overflow
- */
- while (Boost > 1000) {
- Boost /= 2;
- allocation_chunks /= 2;
- }
+ if (cpi->auto_adjust_gold_quantizer) {
+ calc_gf_params(cpi);
+ }
- /* Avoid loss of precision but avoid overflow */
- if ((bits_in_section >> 7) > allocation_chunks) {
- cpi->this_frame_target =
- Boost * (bits_in_section / allocation_chunks);
+ /* If we are using alternate ref instead of gf then do not apply the
+ * boost. It will instead be applied to the altref update (Jim's
+ * modified boost)
+ */
+ if (!cpi->source_alt_ref_active) {
+ if (cpi->oxcf.fixed_q < 0) {
+ if (cpi->pass == 2) {
+ /* The spend on the GF is defined in the two pass
+ * code for two pass encodes
+ */
+ cpi->this_frame_target = cpi->per_frame_bandwidth;
} else {
- cpi->this_frame_target =
- (Boost * bits_in_section) / allocation_chunks;
+ int Boost = cpi->last_boost;
+ int frames_in_section = cpi->frames_till_gf_update_due + 1;
+ int allocation_chunks = (frames_in_section * 100) + (Boost - 100);
+ int bits_in_section = cpi->inter_frame_target * frames_in_section;
+
+ /* Normalize Altboost and allocation chunks down to
+ * prevent overflow
+ */
+ while (Boost > 1000) {
+ Boost /= 2;
+ allocation_chunks /= 2;
+ }
+
+ /* Avoid loss of precision but avoid overflow */
+ if ((bits_in_section >> 7) > allocation_chunks) {
+ cpi->this_frame_target =
+ Boost * (bits_in_section / allocation_chunks);
+ } else {
+ cpi->this_frame_target =
+ (Boost * bits_in_section) / allocation_chunks;
+ }
}
+ } else {
+ cpi->this_frame_target =
+ (estimate_bits_at_q(1, Q, cpi->common.MBs, 1.0) *
+ cpi->last_boost) /
+ 100;
}
} else {
- cpi->this_frame_target =
- (estimate_bits_at_q(1, Q, cpi->common.MBs, 1.0) *
- cpi->last_boost) /
- 100;
+ /* If there is an active ARF at this location use the minimum
+ * bits on this frame even if it is a constructed arf.
+ * The active maximum quantizer ensures that an appropriate
+ * number of bits will be spent if needed for constructed ARFs.
+ */
+ cpi->this_frame_target = 0;
}
+ cpi->current_gf_interval = cpi->frames_till_gf_update_due;
}
- /* If there is an active ARF at this location use the minimum
- * bits on this frame even if it is a contructed arf.
- * The active maximum quantizer insures that an appropriate
- * number of bits will be spent if needed for contstructed ARFs.
- */
- else {
- cpi->this_frame_target = 0;
+ } else {
+ // Special case for 1 pass CBR: fixed gf period.
+ // TODO(marpan): Adjust this boost/interval logic.
+ // If gf_cbr_boost_pct is small (below threshold) set the flag
+ // gf_noboost_onepass_cbr = 1, which forces the gf to use the same
+ // rate correction factor as last.
+ cpi->gf_noboost_onepass_cbr = (cpi->oxcf.gf_cbr_boost_pct <= 100);
+ cpi->baseline_gf_interval = cpi->gf_interval_onepass_cbr;
+ // Skip this update if the zeromv_count is low.
+ if (cpi->zeromv_count > (cpi->common.MBs >> 1)) {
+ cpi->common.refresh_golden_frame = 1;
+ cpi->this_frame_target =
+ (cpi->this_frame_target * (100 + cpi->oxcf.gf_cbr_boost_pct)) / 100;
}
-
+ cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
cpi->current_gf_interval = cpi->frames_till_gf_update_due;
}
}
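[Note: hypothetical numbers for the new 1 pass CBR golden-frame branch above, to make the boost arithmetic concrete.]

/* Assume gf_cbr_boost_pct = 50 (so gf_noboost_onepass_cbr stays 0) and a
 * mostly static scene where zeromv_count > MBs / 2:
 *   this_frame_target = 20000 bits
 *   refresh_golden_frame = 1
 *   this_frame_target = 20000 * (100 + 50) / 100 = 30000 bits
 * With gf_cbr_boost_pct <= 100 the gf_noboost_onepass_cbr flag is set instead,
 * and the golden frame falls back to the regular rate correction factor.
 */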
@@ -1025,8 +1040,9 @@ void vp8_update_rate_correction_factors(VP8_COMP *cpi, int damp_var) {
if (cpi->common.frame_type == KEY_FRAME) {
rate_correction_factor = cpi->key_frame_rate_correction_factor;
} else {
- if (cpi->oxcf.number_of_layers == 1 && (cpi->common.refresh_alt_ref_frame ||
- cpi->common.refresh_golden_frame)) {
+ if (cpi->oxcf.number_of_layers == 1 && !cpi->gf_noboost_onepass_cbr &&
+ (cpi->common.refresh_alt_ref_frame ||
+ cpi->common.refresh_golden_frame)) {
rate_correction_factor = cpi->gf_rate_correction_factor;
} else {
rate_correction_factor = cpi->rate_correction_factor;
@@ -1102,8 +1118,9 @@ void vp8_update_rate_correction_factors(VP8_COMP *cpi, int damp_var) {
if (cpi->common.frame_type == KEY_FRAME) {
cpi->key_frame_rate_correction_factor = rate_correction_factor;
} else {
- if (cpi->oxcf.number_of_layers == 1 && (cpi->common.refresh_alt_ref_frame ||
- cpi->common.refresh_golden_frame)) {
+ if (cpi->oxcf.number_of_layers == 1 && !cpi->gf_noboost_onepass_cbr &&
+ (cpi->common.refresh_alt_ref_frame ||
+ cpi->common.refresh_golden_frame)) {
cpi->gf_rate_correction_factor = rate_correction_factor;
} else {
cpi->rate_correction_factor = rate_correction_factor;
@@ -1118,7 +1135,6 @@ int vp8_regulate_q(VP8_COMP *cpi, int target_bits_per_frame) {
cpi->active_worst_quality = cpi->worst_quality;
return cpi->worst_quality;
}
-
/* Reset Zbin OQ value */
cpi->mb.zbin_over_quant = 0;
@@ -1128,10 +1144,12 @@ int vp8_regulate_q(VP8_COMP *cpi, int target_bits_per_frame) {
if (cpi->common.frame_type == KEY_FRAME) {
Q = cpi->oxcf.key_q;
} else if (cpi->oxcf.number_of_layers == 1 &&
- cpi->common.refresh_alt_ref_frame) {
+ cpi->common.refresh_alt_ref_frame &&
+ !cpi->gf_noboost_onepass_cbr) {
Q = cpi->oxcf.alt_q;
} else if (cpi->oxcf.number_of_layers == 1 &&
- cpi->common.refresh_golden_frame) {
+ cpi->common.refresh_golden_frame &&
+ !cpi->gf_noboost_onepass_cbr) {
Q = cpi->oxcf.gold_q;
}
} else {
@@ -1145,7 +1163,7 @@ int vp8_regulate_q(VP8_COMP *cpi, int target_bits_per_frame) {
if (cpi->common.frame_type == KEY_FRAME) {
correction_factor = cpi->key_frame_rate_correction_factor;
} else {
- if (cpi->oxcf.number_of_layers == 1 &&
+ if (cpi->oxcf.number_of_layers == 1 && !cpi->gf_noboost_onepass_cbr &&
(cpi->common.refresh_alt_ref_frame ||
cpi->common.refresh_golden_frame)) {
correction_factor = cpi->gf_rate_correction_factor;
@@ -1199,6 +1217,7 @@ int vp8_regulate_q(VP8_COMP *cpi, int target_bits_per_frame) {
if (cpi->common.frame_type == KEY_FRAME) {
zbin_oqmax = 0;
} else if (cpi->oxcf.number_of_layers == 1 &&
+ !cpi->gf_noboost_onepass_cbr &&
(cpi->common.refresh_alt_ref_frame ||
(cpi->common.refresh_golden_frame &&
!cpi->source_alt_ref_active))) {
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/vp8_cx_iface.c b/chromium/third_party/libvpx/source/libvpx/vp8/vp8_cx_iface.c
index fac237eec02..f8475ed61da 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/vp8_cx_iface.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/vp8_cx_iface.c
@@ -40,6 +40,7 @@ struct vp8_extracfg {
vp8e_tuning tuning;
unsigned int cq_level; /* constrained quality level */
unsigned int rc_max_intra_bitrate_pct;
+ unsigned int gf_cbr_boost_pct;
unsigned int screen_content_mode;
};
@@ -65,6 +66,7 @@ static struct vp8_extracfg default_extracfg = {
0, /* tuning*/
10, /* cq_level */
0, /* rc_max_intra_bitrate_pct */
+ 0, /* gf_cbr_boost_pct */
0, /* screen_content_mode */
};
@@ -315,6 +317,7 @@ static vpx_codec_err_t set_vp8e_config(VP8_CONFIG *oxcf,
oxcf->target_bandwidth = cfg.rc_target_bitrate;
oxcf->rc_max_intra_bitrate_pct = vp8_cfg.rc_max_intra_bitrate_pct;
+ oxcf->gf_cbr_boost_pct = vp8_cfg.gf_cbr_boost_pct;
oxcf->best_allowed_q = cfg.rc_min_quantizer;
oxcf->worst_allowed_q = cfg.rc_max_quantizer;
@@ -558,6 +561,13 @@ static vpx_codec_err_t set_rc_max_intra_bitrate_pct(vpx_codec_alg_priv_t *ctx,
return update_extracfg(ctx, &extra_cfg);
}
+static vpx_codec_err_t ctrl_set_rc_gf_cbr_boost_pct(vpx_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct vp8_extracfg extra_cfg = ctx->vp8_cfg;
+ extra_cfg.gf_cbr_boost_pct = CAST(VP8E_SET_GF_CBR_BOOST_PCT, args);
+ return update_extracfg(ctx, &extra_cfg);
+}
+
static vpx_codec_err_t set_screen_content_mode(vpx_codec_alg_priv_t *ctx,
va_list args) {
struct vp8_extracfg extra_cfg = ctx->vp8_cfg;
@@ -1159,6 +1169,7 @@ static vpx_codec_ctrl_fn_map_t vp8e_ctf_maps[] = {
{ VP8E_SET_CQ_LEVEL, set_cq_level },
{ VP8E_SET_MAX_INTRA_BITRATE_PCT, set_rc_max_intra_bitrate_pct },
{ VP8E_SET_SCREEN_CONTENT_MODE, set_screen_content_mode },
+ { VP8E_SET_GF_CBR_BOOST_PCT, ctrl_set_rc_gf_cbr_boost_pct },
{ -1, NULL },
};
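[Note: with the control mapping added above, an application can request the CBR golden-frame boost roughly as follows. This is a sketch; encoder setup and error handling are omitted, and the 50% value is only an example.]

#include "vpx/vpx_encoder.h"
#include "vpx/vp8cx.h"

/* ctx is a vpx_codec_ctx_t already initialized with vpx_codec_vp8_cx(). */
/* Request a 50% bit-allocation boost for golden frames in 1 pass CBR mode. */
vpx_codec_control(&ctx, VP8E_SET_GF_CBR_BOOST_PCT, 50);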
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_frame_buffers.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_frame_buffers.c
index efcf2bf885b..a254e79d20e 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_frame_buffers.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_frame_buffers.c
@@ -52,14 +52,12 @@ int vp9_get_frame_buffer(void *cb_priv, size_t min_size,
if (i == int_fb_list->num_internal_frame_buffers) return -1;
if (int_fb_list->int_fb[i].size < min_size) {
- int_fb_list->int_fb[i].data =
- (uint8_t *)vpx_realloc(int_fb_list->int_fb[i].data, min_size);
- if (!int_fb_list->int_fb[i].data) return -1;
-
- // This memset is needed for fixing valgrind error from C loop filter
+ vpx_free(int_fb_list->int_fb[i].data);
+ // The data must be zeroed to fix a valgrind error from the C loop filter
// due to access uninitialized memory in frame border. It could be
- // removed if border is totally removed.
- memset(int_fb_list->int_fb[i].data, 0, min_size);
+ // skipped if border were totally removed.
+ int_fb_list->int_fb[i].data = (uint8_t *)vpx_calloc(1, min_size);
+ if (!int_fb_list->int_fb[i].data) return -1;
int_fb_list->int_fb[i].size = min_size;
}
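[Note: the hunk above replaces realloc-plus-memset with free-plus-calloc so the buffer is zeroed even when a new allocation is returned. A minimal sketch of the same grow-and-zero pattern; the helper name is illustrative, not from libvpx.]

/* Grow *buf to at least min_size bytes with guaranteed zeroed contents. */
static int grow_zeroed(uint8_t **buf, size_t *size, size_t min_size) {
  if (*size < min_size) {
    vpx_free(*buf);                             /* old contents are discarded */
    *buf = (uint8_t *)vpx_calloc(1, min_size);  /* fresh, zero-filled block */
    if (!*buf) return -1;
    *size = min_size;
  }
  return 0;
}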
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_idct.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_idct.c
index c6a39f85ca0..e3a088e2870 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_idct.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_idct.c
@@ -331,8 +331,8 @@ void vp9_highbd_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
// DC only DCT coefficient
if (eob == 1) {
vpx_highbd_idct8x8_1_add(input, dest, stride, bd);
- } else if (eob <= 10) {
- vpx_highbd_idct8x8_10_add(input, dest, stride, bd);
+ } else if (eob <= 12) {
+ vpx_highbd_idct8x8_12_add(input, dest, stride, bd);
} else {
vpx_highbd_idct8x8_64_add(input, dest, stride, bd);
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_rtcd_defs.pl b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_rtcd_defs.pl
index fafc6598393..abef0676396 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_rtcd_defs.pl
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_rtcd_defs.pl
@@ -137,6 +137,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
add_proto qw/void vp9_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/vp9_fdct8x8_quant ssse3/;
} else {
add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
specialize qw/vp9_block_error avx2 msa sse2/;
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodeframe.c b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodeframe.c
index fde0b7e318c..628d1c8d2bc 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodeframe.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodeframe.c
@@ -1517,7 +1517,6 @@ static int tile_worker_hook(TileWorkerData *const tile_data,
return 0;
}
- tile_data->xd.error_info = &tile_data->error_info;
tile_data->xd.corrupted = 0;
do {
@@ -1529,6 +1528,8 @@ static int tile_worker_hook(TileWorkerData *const tile_data,
&tile_data->error_info, &tile_data->bit_reader,
pbi->decrypt_cb, pbi->decrypt_state);
vp9_init_macroblockd(&pbi->common, &tile_data->xd, tile_data->dqcoeff);
+ // vp9_init_macroblockd() resets xd.error_info, so it must be set after init.
+ tile_data->xd.error_info = &tile_data->error_info;
for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end;
mi_row += MI_BLOCK_SIZE) {
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodemv.c b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodemv.c
index 4372ba0371d..1a4152436a2 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodemv.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodemv.c
@@ -770,6 +770,10 @@ static void read_inter_block_mode_info(VP9Decoder *const pbi,
int idx, idy;
PREDICTION_MODE b_mode;
int_mv best_sub8x8[2];
+ const uint32_t invalid_mv = 0x80008000;
+ // Initialize the 2nd element: even though it is not used meaningfully when
+ // is_compound is false, copying/clamping it may trigger an MSan warning.
+ best_sub8x8[1].as_int = invalid_mv;
for (idy = 0; idy < 2; idy += num_4x4_h) {
for (idx = 0; idx < 2; idx += num_4x4_w) {
const int j = idy * 2 + idx;
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_bitstream.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_bitstream.c
index 3f1c430f98d..49aea69ebd1 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_bitstream.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_bitstream.c
@@ -80,8 +80,8 @@ static void prob_diff_update(const vpx_tree_index *tree,
vp9_cond_prob_diff_update(w, &probs[i], branch_ct[i]);
}
-static void write_selected_tx_size(const VP9_COMMON *cm, const MACROBLOCKD *xd,
- vpx_writer *w) {
+static void write_selected_tx_size(const VP9_COMMON *cm,
+ const MACROBLOCKD *const xd, vpx_writer *w) {
TX_SIZE tx_size = xd->mi[0]->tx_size;
BLOCK_SIZE bsize = xd->mi[0]->sb_type;
const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
@@ -95,7 +95,7 @@ static void write_selected_tx_size(const VP9_COMMON *cm, const MACROBLOCKD *xd,
}
}
-static int write_skip(const VP9_COMMON *cm, const MACROBLOCKD *xd,
+static int write_skip(const VP9_COMMON *cm, const MACROBLOCKD *const xd,
int segment_id, const MODE_INFO *mi, vpx_writer *w) {
if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) {
return 1;
@@ -195,7 +195,7 @@ static void write_segment_id(vpx_writer *w, const struct segmentation *seg,
}
// This function encodes the reference frame
-static void write_ref_frames(const VP9_COMMON *cm, const MACROBLOCKD *xd,
+static void write_ref_frames(const VP9_COMMON *cm, const MACROBLOCKD *const xd,
vpx_writer *w) {
const MODE_INFO *const mi = xd->mi[0];
const int is_compound = has_second_ref(mi);
@@ -230,14 +230,16 @@ static void write_ref_frames(const VP9_COMMON *cm, const MACROBLOCKD *xd,
}
}
-static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi,
- vpx_writer *w) {
+static void pack_inter_mode_mvs(VP9_COMP *cpi, const MACROBLOCKD *const xd,
+ const MB_MODE_INFO_EXT *const mbmi_ext,
+ vpx_writer *w,
+ unsigned int *const max_mv_magnitude,
+ int interp_filter_selected[MAX_REF_FRAMES]
+ [SWITCHABLE]) {
VP9_COMMON *const cm = &cpi->common;
const nmv_context *nmvc = &cm->fc->nmvc;
- const MACROBLOCK *const x = &cpi->td.mb;
- const MACROBLOCKD *const xd = &x->e_mbd;
const struct segmentation *const seg = &cm->seg;
- const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+ const MODE_INFO *const mi = xd->mi[0];
const PREDICTION_MODE mode = mi->mode;
const int segment_id = mi->segment_id;
const BLOCK_SIZE bsize = mi->sb_type;
@@ -299,7 +301,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi,
vp9_write_token(w, vp9_switchable_interp_tree,
cm->fc->switchable_interp_prob[ctx],
&switchable_interp_encodings[mi->interp_filter]);
- ++cpi->interp_filter_selected[0][mi->interp_filter];
+ ++interp_filter_selected[0][mi->interp_filter];
} else {
assert(mi->interp_filter == cm->interp_filter);
}
@@ -317,7 +319,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi,
for (ref = 0; ref < 1 + is_compound; ++ref)
vp9_encode_mv(cpi, w, &mi->bmi[j].as_mv[ref].as_mv,
&mbmi_ext->ref_mvs[mi->ref_frame[ref]][0].as_mv,
- nmvc, allow_hp);
+ nmvc, allow_hp, max_mv_magnitude);
}
}
}
@@ -326,16 +328,16 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi,
for (ref = 0; ref < 1 + is_compound; ++ref)
vp9_encode_mv(cpi, w, &mi->mv[ref].as_mv,
&mbmi_ext->ref_mvs[mi->ref_frame[ref]][0].as_mv, nmvc,
- allow_hp);
+ allow_hp, max_mv_magnitude);
}
}
}
}
static void write_mb_modes_kf(const VP9_COMMON *cm, const MACROBLOCKD *xd,
- MODE_INFO **mi_8x8, vpx_writer *w) {
+ vpx_writer *w) {
const struct segmentation *const seg = &cm->seg;
- const MODE_INFO *const mi = mi_8x8[0];
+ const MODE_INFO *const mi = xd->mi[0];
const MODE_INFO *const above_mi = xd->above_mi;
const MODE_INFO *const left_mi = xd->left_mi;
const BLOCK_SIZE bsize = mi->sb_type;
@@ -366,27 +368,29 @@ static void write_mb_modes_kf(const VP9_COMMON *cm, const MACROBLOCKD *xd,
write_intra_mode(w, mi->uv_mode, vp9_kf_uv_mode_prob[mi->mode]);
}
-static void write_modes_b(VP9_COMP *cpi, const TileInfo *const tile,
- vpx_writer *w, TOKENEXTRA **tok,
- const TOKENEXTRA *const tok_end, int mi_row,
- int mi_col) {
+static void write_modes_b(VP9_COMP *cpi, MACROBLOCKD *const xd,
+ const TileInfo *const tile, vpx_writer *w,
+ TOKENEXTRA **tok, const TOKENEXTRA *const tok_end,
+ int mi_row, int mi_col,
+ unsigned int *const max_mv_magnitude,
+ int interp_filter_selected[MAX_REF_FRAMES]
+ [SWITCHABLE]) {
const VP9_COMMON *const cm = &cpi->common;
- MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+ const MB_MODE_INFO_EXT *const mbmi_ext =
+ cpi->td.mb.mbmi_ext_base + (mi_row * cm->mi_cols + mi_col);
MODE_INFO *m;
xd->mi = cm->mi_grid_visible + (mi_row * cm->mi_stride + mi_col);
m = xd->mi[0];
- cpi->td.mb.mbmi_ext =
- cpi->td.mb.mbmi_ext_base + (mi_row * cm->mi_cols + mi_col);
-
set_mi_row_col(xd, tile, mi_row, num_8x8_blocks_high_lookup[m->sb_type],
mi_col, num_8x8_blocks_wide_lookup[m->sb_type], cm->mi_rows,
cm->mi_cols);
if (frame_is_intra_only(cm)) {
- write_mb_modes_kf(cm, xd, xd->mi, w);
+ write_mb_modes_kf(cm, xd, w);
} else {
- pack_inter_mode_mvs(cpi, m, w);
+ pack_inter_mode_mvs(cpi, xd, mbmi_ext, w, max_mv_magnitude,
+ interp_filter_selected);
}
assert(*tok < tok_end);
@@ -415,13 +419,14 @@ static void write_partition(const VP9_COMMON *const cm,
}
}
-static void write_modes_sb(VP9_COMP *cpi, const TileInfo *const tile,
- vpx_writer *w, TOKENEXTRA **tok,
- const TOKENEXTRA *const tok_end, int mi_row,
- int mi_col, BLOCK_SIZE bsize) {
+static void write_modes_sb(VP9_COMP *cpi, MACROBLOCKD *const xd,
+ const TileInfo *const tile, vpx_writer *w,
+ TOKENEXTRA **tok, const TOKENEXTRA *const tok_end,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ unsigned int *const max_mv_magnitude,
+ int interp_filter_selected[MAX_REF_FRAMES]
+ [SWITCHABLE]) {
const VP9_COMMON *const cm = &cpi->common;
- MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
-
const int bsl = b_width_log2_lookup[bsize];
const int bs = (1 << bsl) / 4;
PARTITION_TYPE partition;
@@ -436,30 +441,37 @@ static void write_modes_sb(VP9_COMP *cpi, const TileInfo *const tile,
write_partition(cm, xd, bs, mi_row, mi_col, partition, bsize, w);
subsize = get_subsize(bsize, partition);
if (subsize < BLOCK_8X8) {
- write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+ write_modes_b(cpi, xd, tile, w, tok, tok_end, mi_row, mi_col,
+ max_mv_magnitude, interp_filter_selected);
} else {
switch (partition) {
case PARTITION_NONE:
- write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+ write_modes_b(cpi, xd, tile, w, tok, tok_end, mi_row, mi_col,
+ max_mv_magnitude, interp_filter_selected);
break;
case PARTITION_HORZ:
- write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+ write_modes_b(cpi, xd, tile, w, tok, tok_end, mi_row, mi_col,
+ max_mv_magnitude, interp_filter_selected);
if (mi_row + bs < cm->mi_rows)
- write_modes_b(cpi, tile, w, tok, tok_end, mi_row + bs, mi_col);
+ write_modes_b(cpi, xd, tile, w, tok, tok_end, mi_row + bs, mi_col,
+ max_mv_magnitude, interp_filter_selected);
break;
case PARTITION_VERT:
- write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+ write_modes_b(cpi, xd, tile, w, tok, tok_end, mi_row, mi_col,
+ max_mv_magnitude, interp_filter_selected);
if (mi_col + bs < cm->mi_cols)
- write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + bs);
+ write_modes_b(cpi, xd, tile, w, tok, tok_end, mi_row, mi_col + bs,
+ max_mv_magnitude, interp_filter_selected);
break;
case PARTITION_SPLIT:
- write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, subsize);
- write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col + bs,
- subsize);
- write_modes_sb(cpi, tile, w, tok, tok_end, mi_row + bs, mi_col,
- subsize);
- write_modes_sb(cpi, tile, w, tok, tok_end, mi_row + bs, mi_col + bs,
- subsize);
+ write_modes_sb(cpi, xd, tile, w, tok, tok_end, mi_row, mi_col, subsize,
+ max_mv_magnitude, interp_filter_selected);
+ write_modes_sb(cpi, xd, tile, w, tok, tok_end, mi_row, mi_col + bs,
+ subsize, max_mv_magnitude, interp_filter_selected);
+ write_modes_sb(cpi, xd, tile, w, tok, tok_end, mi_row + bs, mi_col,
+ subsize, max_mv_magnitude, interp_filter_selected);
+ write_modes_sb(cpi, xd, tile, w, tok, tok_end, mi_row + bs, mi_col + bs,
+ subsize, max_mv_magnitude, interp_filter_selected);
break;
default: assert(0);
}
@@ -471,11 +483,13 @@ static void write_modes_sb(VP9_COMP *cpi, const TileInfo *const tile,
update_partition_context(xd, mi_row, mi_col, subsize, bsize);
}
-static void write_modes(VP9_COMP *cpi, const TileInfo *const tile,
- vpx_writer *w, TOKENEXTRA **tok,
- const TOKENEXTRA *const tok_end) {
+static void write_modes(VP9_COMP *cpi, MACROBLOCKD *const xd,
+ const TileInfo *const tile, vpx_writer *w,
+ TOKENEXTRA **tok, const TOKENEXTRA *const tok_end,
+ unsigned int *const max_mv_magnitude,
+ int interp_filter_selected[MAX_REF_FRAMES]
+ [SWITCHABLE]) {
const VP9_COMMON *const cm = &cpi->common;
- MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
int mi_row, mi_col;
set_partition_probs(cm, xd);
@@ -485,7 +499,8 @@ static void write_modes(VP9_COMP *cpi, const TileInfo *const tile,
vp9_zero(xd->left_seg_context);
for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
mi_col += MI_BLOCK_SIZE)
- write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, BLOCK_64X64);
+ write_modes_sb(cpi, xd, tile, w, tok, tok_end, mi_row, mi_col,
+ BLOCK_64X64, max_mv_magnitude, interp_filter_selected);
}
}
@@ -900,8 +915,128 @@ int vp9_get_refresh_mask(VP9_COMP *cpi) {
}
}
+static int encode_tile_worker(VP9_COMP *cpi, VP9BitstreamWorkerData *data) {
+ MACROBLOCKD *const xd = &data->xd;
+ vpx_start_encode(&data->bit_writer, data->dest);
+ write_modes(cpi, xd, &cpi->tile_data[data->tile_idx].tile_info,
+ &data->bit_writer, &data->tok, data->tok_end,
+ &data->max_mv_magnitude, data->interp_filter_selected);
+ assert(data->tok == data->tok_end);
+ vpx_stop_encode(&data->bit_writer);
+ return 1;
+}
+
+void vp9_bitstream_encode_tiles_buffer_dealloc(VP9_COMP *const cpi) {
+ if (cpi->vp9_bitstream_worker_data) {
+ int i;
+ for (i = 1; i < cpi->num_workers; ++i) {
+ vpx_free(cpi->vp9_bitstream_worker_data[i].dest);
+ }
+ vpx_free(cpi->vp9_bitstream_worker_data);
+ cpi->vp9_bitstream_worker_data = NULL;
+ }
+}
+
+static int encode_tiles_buffer_alloc(VP9_COMP *const cpi) {
+ int i;
+ const size_t worker_data_size =
+ cpi->num_workers * sizeof(*cpi->vp9_bitstream_worker_data);
+ cpi->vp9_bitstream_worker_data = vpx_memalign(16, worker_data_size);
+ if (!cpi->vp9_bitstream_worker_data) return 1;
+ memset(cpi->vp9_bitstream_worker_data, 0, worker_data_size);
+ for (i = 1; i < cpi->num_workers; ++i) {
+ cpi->vp9_bitstream_worker_data[i].dest_size =
+ cpi->oxcf.width * cpi->oxcf.height;
+ cpi->vp9_bitstream_worker_data[i].dest =
+ vpx_malloc(cpi->vp9_bitstream_worker_data[i].dest_size);
+ if (!cpi->vp9_bitstream_worker_data[i].dest) return 1;
+ }
+ return 0;
+}
+
+static size_t encode_tiles_mt(VP9_COMP *cpi, uint8_t *data_ptr) {
+ const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
+ VP9_COMMON *const cm = &cpi->common;
+ const int tile_cols = 1 << cm->log2_tile_cols;
+ const int num_workers = cpi->num_workers;
+ size_t total_size = 0;
+ int tile_col = 0;
+
+ if (!cpi->vp9_bitstream_worker_data ||
+ cpi->vp9_bitstream_worker_data[1].dest_size >
+ (cpi->oxcf.width * cpi->oxcf.height)) {
+ vp9_bitstream_encode_tiles_buffer_dealloc(cpi);
+ if (encode_tiles_buffer_alloc(cpi)) return 0;
+ }
+
+ while (tile_col < tile_cols) {
+ int i, j;
+ for (i = 0; i < num_workers && tile_col < tile_cols; ++i) {
+ VPxWorker *const worker = &cpi->workers[i];
+ VP9BitstreamWorkerData *const data = &cpi->vp9_bitstream_worker_data[i];
+
+ // Populate the worker data.
+ data->xd = cpi->td.mb.e_mbd;
+ data->tile_idx = tile_col;
+ data->tok = cpi->tile_tok[0][tile_col];
+ data->tok_end = cpi->tile_tok[0][tile_col] + cpi->tok_count[0][tile_col];
+ data->max_mv_magnitude = cpi->max_mv_magnitude;
+ memset(data->interp_filter_selected, 0,
+ sizeof(data->interp_filter_selected[0][0]) * SWITCHABLE);
+
+ // First thread can directly write into the output buffer.
+ if (i == 0) {
+ // If this worker happens to be for the last tile, then do not offset it
+ // by 4 for the tile size.
+ data->dest =
+ data_ptr + total_size + (tile_col == tile_cols - 1 ? 0 : 4);
+ }
+ worker->data1 = cpi;
+ worker->data2 = data;
+ worker->hook = (VPxWorkerHook)encode_tile_worker;
+ worker->had_error = 0;
+
+ if (i < num_workers - 1) {
+ winterface->launch(worker);
+ } else {
+ winterface->execute(worker);
+ }
+ ++tile_col;
+ }
+ for (j = 0; j < i; ++j) {
+ VPxWorker *const worker = &cpi->workers[j];
+ VP9BitstreamWorkerData *const data =
+ (VP9BitstreamWorkerData *)worker->data2;
+ uint32_t tile_size;
+ int k;
+
+ if (!winterface->sync(worker)) return 0;
+ tile_size = data->bit_writer.pos;
+
+ // Aggregate per-thread bitstream stats.
+ cpi->max_mv_magnitude =
+ VPXMAX(cpi->max_mv_magnitude, data->max_mv_magnitude);
+ for (k = 0; k < SWITCHABLE; ++k) {
+ cpi->interp_filter_selected[0][k] += data->interp_filter_selected[0][k];
+ }
+
+ // Prefix the size of the tile on all but the last.
+ if (tile_col != tile_cols || j < i - 1) {
+ mem_put_be32(data_ptr + total_size, tile_size);
+ total_size += 4;
+ }
+ if (j > 0) {
+ memcpy(data_ptr + total_size, data->dest, tile_size);
+ }
+ total_size += tile_size;
+ }
+ }
+ return total_size;
+}
+
static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) {
VP9_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
vpx_writer residual_bc;
int tile_row, tile_col;
TOKENEXTRA *tok_end;
@@ -912,6 +1047,14 @@ static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) {
memset(cm->above_seg_context, 0,
sizeof(*cm->above_seg_context) * mi_cols_aligned_to_sb(cm->mi_cols));
+ // Encoding tiles in parallel is done only for realtime mode now. In other
+ // modes the speed up is insignificant and requires further testing to ensure
+ // that it does not make the overall process worse in any case.
+ if (cpi->oxcf.mode == REALTIME && cpi->num_workers > 1 && tile_rows == 1 &&
+ tile_cols > 1) {
+ return encode_tiles_mt(cpi, data_ptr);
+ }
+
for (tile_row = 0; tile_row < tile_rows; tile_row++) {
for (tile_col = 0; tile_col < tile_cols; tile_col++) {
int tile_idx = tile_row * tile_cols + tile_col;
@@ -925,8 +1068,9 @@ static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) {
else
vpx_start_encode(&residual_bc, data_ptr + total_size);
- write_modes(cpi, &cpi->tile_data[tile_idx].tile_info, &residual_bc, &tok,
- tok_end);
+ write_modes(cpi, xd, &cpi->tile_data[tile_idx].tile_info, &residual_bc,
+ &tok, tok_end, &cpi->max_mv_magnitude,
+ cpi->interp_filter_selected);
assert(tok == tok_end);
vpx_stop_encode(&residual_bc);
if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1) {
@@ -938,7 +1082,6 @@ static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) {
total_size += residual_bc.pos;
}
}
-
return total_size;
}
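[Note: encode_tiles_mt above follows the usual VPxWorker pattern: launch num_workers - 1 jobs asynchronously, run the last one on the calling thread, then sync and fold per-worker stats back into the encoder. A stripped-down fragment of that pattern, with the batching over tile columns and the tile-size prefixing removed for brevity:]

/* Dispatch one job per worker; the last job reuses the calling thread. */
for (i = 0; i < num_workers; ++i) {
  VPxWorker *const worker = &cpi->workers[i];
  worker->data1 = cpi;
  worker->data2 = &cpi->vp9_bitstream_worker_data[i];
  worker->hook = (VPxWorkerHook)encode_tile_worker;
  if (i < num_workers - 1)
    winterface->launch(worker);   /* asynchronous */
  else
    winterface->execute(worker);  /* synchronous, on this thread */
}
/* Wait for every worker, then merge its thread-local bitstream stats. */
for (i = 0; i < num_workers; ++i) {
  VP9BitstreamWorkerData *const data = &cpi->vp9_bitstream_worker_data[i];
  if (!winterface->sync(&cpi->workers[i])) return 0;
  cpi->max_mv_magnitude = VPXMAX(cpi->max_mv_magnitude, data->max_mv_magnitude);
  for (k = 0; k < SWITCHABLE; ++k)
    cpi->interp_filter_selected[0][k] += data->interp_filter_selected[0][k];
}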
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_bitstream.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_bitstream.h
index 8c97d37f77e..044a3bbc7bc 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_bitstream.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_bitstream.h
@@ -17,8 +17,26 @@ extern "C" {
#include "vp9/encoder/vp9_encoder.h"
+typedef struct VP9BitstreamWorkerData {
+ uint8_t *dest;
+ int dest_size;
+ TOKENEXTRA *tok;
+ TOKENEXTRA *tok_end;
+ vpx_writer bit_writer;
+ int tile_idx;
+ unsigned int max_mv_magnitude;
+ // The size of interp_filter_selected in VP9_COMP is actually
+ // MAX_REFERENCE_FRAMES x SWITCHABLE. But when encoding tiles, all we ever do
+ // is increment the very first index (index 0) for the first dimension. Hence
+ // this is sufficient.
+ int interp_filter_selected[1][SWITCHABLE];
+ DECLARE_ALIGNED(16, MACROBLOCKD, xd);
+} VP9BitstreamWorkerData;
+
int vp9_get_refresh_mask(VP9_COMP *cpi);
+void vp9_bitstream_encode_tiles_buffer_dealloc(VP9_COMP *const cpi);
+
void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, size_t *size);
static INLINE int vp9_preserve_existing_gf(VP9_COMP *cpi) {
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeframe.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeframe.c
index 335faca82b1..3ab05375ff7 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeframe.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeframe.c
@@ -795,7 +795,12 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
v16x16 vt2[16];
int force_split[21];
int avg_32x32;
+ int max_var_32x32 = 0;
+ int min_var_32x32 = INT_MAX;
+ int var_32x32;
int avg_16x16[4];
+ int64_t threshold_4x4avg;
+ NOISE_LEVEL noise_level = kLow;
uint8_t *s;
const uint8_t *d;
int sp;
@@ -829,6 +834,9 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
}
}
+ threshold_4x4avg =
+ (cpi->oxcf.speed < 8) ? thresholds[1] << 1 : thresholds[2] >> 1;
+
memset(x->variance_low, 0, sizeof(x->variance_low));
if (xd->mb_to_right_edge < 0) pixels_wide += (xd->mb_to_right_edge >> 3);
@@ -846,7 +854,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
// that the temporal reference frame will always be of type LAST_FRAME.
// TODO(marpan): If that assumption is broken, we need to revisit this code.
MODE_INFO *mi = xd->mi[0];
- const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, LAST_FRAME);
+ YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, LAST_FRAME);
const YV12_BUFFER_CONFIG *yv12_g = NULL;
unsigned int y_sad_g, y_sad_thr;
@@ -871,9 +879,18 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
y_sad_g = UINT_MAX;
}
- vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col,
- &cm->frame_refs[LAST_FRAME - 1].sf);
- mi->ref_frame[0] = LAST_FRAME;
+ if (cpi->oxcf.lag_in_frames > 0 && cpi->oxcf.rc_mode == VPX_VBR &&
+ cpi->rc.is_src_frame_alt_ref) {
+ yv12 = get_ref_frame_buffer(cpi, ALTREF_FRAME);
+ vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col,
+ &cm->frame_refs[ALTREF_FRAME - 1].sf);
+ mi->ref_frame[0] = ALTREF_FRAME;
+ y_sad_g = UINT_MAX;
+ } else {
+ vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col,
+ &cm->frame_refs[LAST_FRAME - 1].sf);
+ mi->ref_frame[0] = LAST_FRAME;
+ }
mi->ref_frame[1] = NONE;
mi->sb_type = BLOCK_64X64;
mi->mv[0].as_int = 0;
@@ -986,7 +1003,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
}
if (is_key_frame || (low_res &&
vt.split[i].split[j].part_variances.none.variance >
- (thresholds[1] << 1))) {
+ threshold_4x4avg)) {
force_split[split_index] = 0;
// Go down to 4x4 down-sampling for variance.
variance4x4downsample[i2 + j] = 1;
@@ -1029,6 +1046,9 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
// (64x64) level.
if (!force_split[i + 1]) {
get_variance(&vt.split[i].part_variances.none);
+ var_32x32 = vt.split[i].part_variances.none.variance;
+ max_var_32x32 = VPXMAX(var_32x32, max_var_32x32);
+ min_var_32x32 = VPXMIN(var_32x32, min_var_32x32);
if (vt.split[i].part_variances.none.variance > thresholds[1] ||
(!is_key_frame &&
vt.split[i].part_variances.none.variance > (thresholds[1] >> 1) &&
@@ -1036,15 +1056,27 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
force_split[i + 1] = 1;
force_split[0] = 1;
}
- avg_32x32 += vt.split[i].part_variances.none.variance;
+ avg_32x32 += var_32x32;
}
}
if (!force_split[0]) {
fill_variance_tree(&vt, BLOCK_64X64);
get_variance(&vt.part_variances.none);
+ if (cpi->noise_estimate.enabled)
+ noise_level = vp9_noise_estimate_extract_level(&cpi->noise_estimate);
// If variance of this 64x64 block is above (some threshold of) the average
// variance over the sub-32x32 blocks, then force this block to split.
- if (!is_key_frame && vt.part_variances.none.variance > (5 * avg_32x32) >> 4)
+ // Only checking this for noise level >= medium for now.
+ if (!is_key_frame && noise_level >= kMedium &&
+ vt.part_variances.none.variance > (5 * avg_32x32) >> 4)
+ force_split[0] = 1;
+ // Else if the maximum 32x32 variance minus the minimum 32x32 variance in
+ // a 64x64 block is greater than the threshold and the maximum 32x32 variance
+ // is above a minimum threshold, then force the split of the 64x64 block.
+ // Only check this for low noise.
+ else if (!is_key_frame && noise_level < kMedium &&
+ (max_var_32x32 - min_var_32x32) > 3 * (thresholds[0] >> 3) &&
+ max_var_32x32 > thresholds[0] >> 1)
force_split[0] = 1;
}
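[Note: a numeric illustration of the new 64x64 split conditions; all values are hypothetical.]

/* Low-noise branch, with thresholds[0] = 8000 and per-32x32 variances
 * 900, 1200, 7400, 9800 inside one 64x64 block:
 *   max_var_32x32 - min_var_32x32 = 9800 - 900 = 8900
 *   8900 > 3 * (thresholds[0] >> 3) = 3000           -> true
 *   max_var_32x32 = 9800 > thresholds[0] >> 1 = 4000 -> true
 *   => force_split[0] = 1, so the 64x64 block is coded as four 32x32s.
 * When the estimated noise level is kMedium or higher, the older check
 * (64x64 variance > (5 * avg_32x32) >> 4) is used instead.
 */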
@@ -1863,7 +1895,7 @@ static void update_state_rt(VP9_COMP *cpi, ThreadData *td,
}
}
- if (cm->use_prev_frame_mvs ||
+ if (cm->use_prev_frame_mvs || !cm->error_resilient_mode ||
(cpi->svc.use_base_mv && cpi->svc.number_spatial_layers > 1 &&
cpi->svc.spatial_layer_id != cpi->svc.number_spatial_layers - 1)) {
MV_REF *const frame_mvs =
@@ -3942,8 +3974,10 @@ void vp9_encode_tile(VP9_COMP *cpi, ThreadData *td, int tile_row,
int mi_row;
// Set up pointers to per thread motion search counters.
- td->mb.m_search_count_ptr = &td->rd_counts.m_search_count;
- td->mb.ex_search_count_ptr = &td->rd_counts.ex_search_count;
+ this_tile->m_search_count = 0; // Count of motion search hits.
+ this_tile->ex_search_count = 0; // Exhaustive mesh search hits.
+ td->mb.m_search_count_ptr = &this_tile->m_search_count;
+ td->mb.ex_search_count_ptr = &this_tile->ex_search_count;
for (mi_row = mi_row_start; mi_row < mi_row_end; mi_row += MI_BLOCK_SIZE) {
if (cpi->sf.use_nonrd_pick_mode)
@@ -4048,6 +4082,7 @@ static void encode_frame_internal(VP9_COMP *cpi) {
vp9_zero(x->zcoeff_blk);
if (cm->frame_type != KEY_FRAME && cpi->rc.frames_since_golden == 0 &&
+ !(cpi->oxcf.lag_in_frames > 0 && cpi->oxcf.rc_mode == VPX_VBR) &&
!cpi->use_svc)
cpi->ref_frame_flags &= (~VP9_GOLD_FLAG);
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemv.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemv.c
index 874a8e4b981..023d087c2ce 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemv.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemv.c
@@ -208,7 +208,8 @@ void vp9_write_nmv_probs(VP9_COMMON *cm, int usehp, vpx_writer *w,
}
void vp9_encode_mv(VP9_COMP *cpi, vpx_writer *w, const MV *mv, const MV *ref,
- const nmv_context *mvctx, int usehp) {
+ const nmv_context *mvctx, int usehp,
+ unsigned int *const max_mv_magnitude) {
const MV diff = { mv->row - ref->row, mv->col - ref->col };
const MV_JOINT_TYPE j = vp9_get_mv_joint(&diff);
usehp = usehp && use_mv_hp(ref);
@@ -223,8 +224,8 @@ void vp9_encode_mv(VP9_COMP *cpi, vpx_writer *w, const MV *mv, const MV *ref,
// If auto_mv_step_size is enabled then keep track of the largest
// motion vector component used.
if (cpi->sf.mv.auto_mv_step_size) {
- unsigned int maxv = VPXMAX(abs(mv->row), abs(mv->col)) >> 3;
- cpi->max_mv_magnitude = VPXMAX(maxv, cpi->max_mv_magnitude);
+ const unsigned int maxv = VPXMAX(abs(mv->row), abs(mv->col)) >> 3;
+ *max_mv_magnitude = VPXMAX(maxv, *max_mv_magnitude);
}
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemv.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemv.h
index ad77b8154f3..9fc7ab8dc45 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemv.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemv.h
@@ -23,7 +23,8 @@ void vp9_write_nmv_probs(VP9_COMMON *cm, int usehp, vpx_writer *w,
nmv_context_counts *const counts);
void vp9_encode_mv(VP9_COMP *cpi, vpx_writer *w, const MV *mv, const MV *ref,
- const nmv_context *mvctx, int usehp);
+ const nmv_context *mvctx, int usehp,
+ unsigned int *const max_mv_magnitude);
void vp9_build_nmv_cost_table(int *mvjoint, int *mvcost[2],
const nmv_context *mvctx, int usehp);
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.c
index 12f02e7c5d9..2a58003829c 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.c
@@ -2030,7 +2030,10 @@ void vp9_remove_compressor(VP9_COMP *cpi) {
vpx_free(cpi->tile_thr_data);
vpx_free(cpi->workers);
- if (cpi->num_workers > 1) vp9_loop_filter_dealloc(&cpi->lf_row_sync);
+ if (cpi->num_workers > 1) {
+ vp9_loop_filter_dealloc(&cpi->lf_row_sync);
+ vp9_bitstream_encode_tiles_buffer_dealloc(cpi);
+ }
vp9_alt_ref_aq_destroy(cpi->alt_ref_aq);
@@ -2438,6 +2441,8 @@ static int recode_loop_test(VP9_COMP *cpi, int high_limit, int low_limit, int q,
cpi->resize_pending = 1;
return 1;
}
+ // Force recode if projected_frame_size > max_frame_bandwidth
+ if (rc->projected_frame_size >= rc->max_frame_bandwidth) return 1;
// TODO(agrange) high_limit could be greater than the scale-down threshold.
if ((rc->projected_frame_size > high_limit && q < maxq) ||
@@ -2796,7 +2801,7 @@ static void output_frame_level_debug_stats(VP9_COMP *cpi) {
dc_quant_devisor = 4.0;
#endif
- fprintf(f, "%10u %dx%d %10d %10d %d %d %10d %10d %10d %10d"
+ fprintf(f, "%10u %dx%d %d %d %10d %10d %10d %10d"
"%10"PRId64" %10"PRId64" %5d %5d %10"PRId64" "
"%10"PRId64" %10"PRId64" %10d "
"%7.2lf %7.2lf %7.2lf %7.2lf %7.2lf"
@@ -2805,8 +2810,6 @@ static void output_frame_level_debug_stats(VP9_COMP *cpi) {
"%10lf %8u %10"PRId64" %10d %10d %10d %10d %10d\n",
cpi->common.current_video_frame,
cm->width, cm->height,
- cpi->td.rd_counts.m_search_count,
- cpi->td.rd_counts.ex_search_count,
cpi->rc.source_alt_ref_pending,
cpi->rc.source_alt_ref_active,
cpi->rc.this_frame_target,
@@ -3124,7 +3127,8 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
if (cpi->oxcf.pass == 0 && cpi->oxcf.mode == REALTIME &&
cpi->oxcf.speed >= 5 && cpi->resize_state == 0 &&
(cpi->oxcf.content == VP9E_CONTENT_SCREEN ||
- cpi->oxcf.rc_mode == VPX_VBR))
+ cpi->oxcf.rc_mode == VPX_VBR) &&
+ cm->show_frame)
vp9_avg_source_sad(cpi);
// For 1 pass SVC, since only ZEROMV is allowed for upsampled reference
@@ -3214,6 +3218,13 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
vpx_clear_system_state();
}
+#define MAX_QSTEP_ADJ 4
+static int get_qstep_adj(int rate_excess, int rate_limit) {
+ int qstep =
+ rate_limit ? ((rate_excess + rate_limit / 2) / rate_limit) : INT_MAX;
+ return VPXMIN(qstep, MAX_QSTEP_ADJ);
+}
+
static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size,
uint8_t *dest) {
VP9_COMMON *const cm = &cpi->common;
@@ -3387,6 +3398,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size,
// to attempt to recode.
int last_q = q;
int retries = 0;
+ int qstep;
if (cpi->resize_pending == 1) {
// Change in frame size so go back around the recode loop.
@@ -3412,7 +3424,10 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size,
q_high = rc->worst_quality;
// Raise Qlow as to at least the current value
- q_low = q < q_high ? q + 1 : q_high;
+ qstep =
+ get_qstep_adj(rc->projected_frame_size, rc->this_frame_target);
+ q_low = VPXMIN(q + qstep, q_high);
+ // q_low = q < q_high ? q + 1 : q_high;
if (undershoot_seen || loop_at_this_size > 1) {
// Update rate_correction_factor unless
@@ -3437,7 +3452,10 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size,
overshoot_seen = 1;
} else {
// Frame is too small
- q_high = q > q_low ? q - 1 : q_low;
+ qstep =
+ get_qstep_adj(rc->this_frame_target, rc->projected_frame_size);
+ q_high = VPXMAX(q - qstep, q_low);
+ // q_high = q > q_low ? q - 1 : q_low;
if (overshoot_seen || loop_at_this_size > 1) {
vp9_rc_update_rate_correction_factors(cpi);
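[Note: a worked example of the new q-step adjustment used in the recode loop; the frame sizes are hypothetical.]

/* Overshoot case: projected_frame_size = 30000, this_frame_target = 10000.
 *   qstep = (30000 + 10000 / 2) / 10000 = 3   (MAX_QSTEP_ADJ caps this at 4)
 *   q_low = VPXMIN(q + 3, q_high)
 * The lower bound now jumps in proportion to the overshoot ratio instead of
 * always stepping by 1; the undershoot case adjusts q_high symmetrically.
 */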
@@ -4477,7 +4495,8 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
cpi->svc.layer_context[cpi->svc.spatial_layer_id].has_alt_frame = 1;
#endif
- if ((oxcf->arnr_max_frames > 0) && (oxcf->arnr_strength > 0)) {
+ if ((oxcf->mode != REALTIME) && (oxcf->arnr_max_frames > 0) &&
+ (oxcf->arnr_strength > 0)) {
int bitrate = cpi->rc.avg_frame_bandwidth / 40;
int not_low_bitrate = bitrate > ALT_REF_AQ_LOW_BITRATE_BOUNDARY;
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.h
index 66e41492b57..0007e6395da 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.h
@@ -267,14 +267,14 @@ typedef struct TileDataEnc {
TileInfo tile_info;
int thresh_freq_fact[BLOCK_SIZES][MAX_MODES];
int mode_map[BLOCK_SIZES][MAX_MODES];
+ int m_search_count;
+ int ex_search_count;
} TileDataEnc;
typedef struct RD_COUNTS {
vp9_coeff_count coef_counts[TX_SIZES][PLANE_TYPES];
int64_t comp_pred_diff[REFERENCE_MODES];
int64_t filter_diff[SWITCHABLE_FILTER_CONTEXTS];
- int m_search_count;
- int ex_search_count;
} RD_COUNTS;
typedef struct ThreadData {
@@ -601,6 +601,7 @@ typedef struct VP9_COMP {
VPxWorker *workers;
struct EncWorkerData *tile_thr_data;
VP9LfSync lf_row_sync;
+ struct VP9BitstreamWorkerData *vp9_bitstream_worker_data;
int keep_level_stats;
Vp9LevelInfo level_info;
@@ -735,7 +736,8 @@ static INLINE int is_one_pass_cbr_svc(const struct VP9_COMP *const cpi) {
}
static INLINE int is_altref_enabled(const VP9_COMP *const cpi) {
- return cpi->oxcf.mode != REALTIME && cpi->oxcf.lag_in_frames > 0 &&
+ return !(cpi->oxcf.mode == REALTIME && cpi->oxcf.rc_mode == VPX_CBR) &&
+ cpi->oxcf.lag_in_frames > 0 &&
(cpi->oxcf.enable_auto_arf &&
(!is_two_pass_svc(cpi) ||
cpi->oxcf.ss_enable_auto_arf[cpi->svc.spatial_layer_id]));
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ethread.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ethread.c
index 7657573bbf0..f4f7c7baccd 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ethread.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ethread.c
@@ -30,10 +30,6 @@ static void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) {
for (n = 0; n < ENTROPY_TOKENS; n++)
td->rd_counts.coef_counts[i][j][k][l][m][n] +=
td_t->rd_counts.coef_counts[i][j][k][l][m][n];
-
- // Counts of all motion searches and exhuastive mesh searches.
- td->rd_counts.m_search_count += td_t->rd_counts.m_search_count;
- td->rd_counts.ex_search_count += td_t->rd_counts.ex_search_count;
}
static int enc_worker_hook(EncWorkerData *const thread_data, void *unused) {
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.c
index 2f1fe360d85..788952d3467 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.c
@@ -48,10 +48,8 @@
#define FIRST_PASS_Q 10.0
#define GF_MAX_BOOST 96.0
#define INTRA_MODE_PENALTY 1024
-#define KF_MAX_BOOST 128.0
#define MIN_ARF_GF_BOOST 240
#define MIN_DECAY_FACTOR 0.01
-#define MIN_KF_BOOST 300
#define NEW_MV_MODE_PENALTY 32
#define SVC_FACTOR_PT_LOW 0.45
#define DARK_THRESH 64
@@ -1578,7 +1576,7 @@ static double get_sr_decay_rate(const VP9_COMP *cpi,
sr_decay = 1.0 - (SR_DIFF_PART * sr_diff) - motion_amplitude_part -
(INTRA_PART * modified_pcnt_intra);
}
- return VPXMAX(sr_decay, VPXMIN(DEFAULT_DECAY_LIMIT, modified_pct_inter));
+ return VPXMAX(sr_decay, DEFAULT_DECAY_LIMIT);
}
// This function gives an estimate of how badly we believe the prediction
@@ -1681,6 +1679,7 @@ static void accumulate_frame_motion_stats(const FIRSTPASS_STATS *stats,
#define BASELINE_ERR_PER_MB 1000.0
static double calc_frame_boost(VP9_COMP *cpi, const FIRSTPASS_STATS *this_frame,
+ double *sr_accumulator,
double this_frame_mv_in_out, double max_boost) {
double frame_boost;
const double lq = vp9_convert_qindex_to_q(
@@ -1694,17 +1693,56 @@ static double calc_frame_boost(VP9_COMP *cpi, const FIRSTPASS_STATS *this_frame,
// Underlying boost factor is based on inter error ratio.
frame_boost = (BASELINE_ERR_PER_MB * num_mbs) /
- DOUBLE_DIVIDE_CHECK(this_frame->coded_error);
+ DOUBLE_DIVIDE_CHECK(this_frame->coded_error + *sr_accumulator);
+
+ // Update the accumulator for second ref error difference.
+ // This is intended to give an indication of how much the coded error is
+ // increasing over time.
+ *sr_accumulator += (this_frame->sr_coded_error - this_frame->coded_error) / 1;
+ *sr_accumulator = VPXMAX(0.0, *sr_accumulator);
+
+ // Small adjustment for cases where there is a zoom out
+ if (this_frame_mv_in_out > 0.0)
+ frame_boost += frame_boost * (this_frame_mv_in_out * 2.0);
+
+ // Q correction and scaling
frame_boost = frame_boost * BOOST_FACTOR * boost_q_correction;
- // Increase boost for frames where new data coming into frame (e.g. zoom out).
- // Slightly reduce boost if there is a net balance of motion out of the frame
- // (zoom in). The range for this_frame_mv_in_out is -1.0 to +1.0.
+ return VPXMIN(frame_boost, max_boost * boost_q_correction);
+}
+
+#define KF_BOOST_FACTOR 12.5
+static double calc_kf_frame_boost(VP9_COMP *cpi,
+ const FIRSTPASS_STATS *this_frame,
+ double *sr_accumulator,
+ double this_frame_mv_in_out,
+ double max_boost) {
+ double frame_boost;
+ const double lq = vp9_convert_qindex_to_q(
+ cpi->rc.avg_frame_qindex[INTER_FRAME], cpi->common.bit_depth);
+ const double boost_q_correction = VPXMIN((0.50 + (lq * 0.015)), 2.00);
+ int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs
+ : cpi->common.MBs;
+
+ // Correct for any inactive region in the image
+ num_mbs = (int)VPXMAX(1, num_mbs * calculate_active_area(cpi, this_frame));
+
+ // Underlying boost factor is based on inter error ratio.
+ frame_boost = (BASELINE_ERR_PER_MB * num_mbs) /
+ DOUBLE_DIVIDE_CHECK(this_frame->coded_error + *sr_accumulator);
+
+ // Update the accumulator for second ref error difference.
+ // This is intended to give an indication of how much the coded error is
+ // increasing over time.
+ *sr_accumulator += (this_frame->sr_coded_error - this_frame->coded_error) / 1;
+ *sr_accumulator = VPXMAX(0.0, *sr_accumulator);
+
+ // Small adjustment for cases where there is a zoom out
if (this_frame_mv_in_out > 0.0)
frame_boost += frame_boost * (this_frame_mv_in_out * 2.0);
- // In the extreme case the boost is halved.
- else
- frame_boost += frame_boost * (this_frame_mv_in_out / 2.0);
+
+ // Q correction and scaling
+ frame_boost = frame_boost * KF_BOOST_FACTOR * boost_q_correction;
return VPXMIN(frame_boost, max_boost * boost_q_correction);
}
@@ -1719,6 +1757,7 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, int f_frames, int b_frames,
double this_frame_mv_in_out = 0.0;
double mv_in_out_accumulator = 0.0;
double abs_mv_in_out_accumulator = 0.0;
+ double sr_accumulator = 0.0;
int arf_boost;
int flash_detected = 0;
@@ -1745,9 +1784,10 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, int f_frames, int b_frames,
: decay_accumulator;
}
- boost_score +=
- decay_accumulator *
- calc_frame_boost(cpi, this_frame, this_frame_mv_in_out, GF_MAX_BOOST);
+ sr_accumulator = 0.0;
+ boost_score += decay_accumulator *
+ calc_frame_boost(cpi, this_frame, &sr_accumulator,
+ this_frame_mv_in_out, GF_MAX_BOOST);
}
*f_boost = (int)boost_score;
@@ -1759,6 +1799,7 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, int f_frames, int b_frames,
this_frame_mv_in_out = 0.0;
mv_in_out_accumulator = 0.0;
abs_mv_in_out_accumulator = 0.0;
+ sr_accumulator = 0.0;
// Search backward towards last gf position.
for (i = -1; i >= -b_frames; --i) {
@@ -1783,9 +1824,10 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, int f_frames, int b_frames,
: decay_accumulator;
}
- boost_score +=
- decay_accumulator *
- calc_frame_boost(cpi, this_frame, this_frame_mv_in_out, GF_MAX_BOOST);
+ sr_accumulator = 0.0;
+ boost_score += decay_accumulator *
+ calc_frame_boost(cpi, this_frame, &sr_accumulator,
+ this_frame_mv_in_out, GF_MAX_BOOST);
}
*b_boost = (int)boost_score;
@@ -2085,7 +2127,6 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
double mv_ratio_accumulator = 0.0;
double decay_accumulator = 1.0;
double zero_motion_accumulator = 1.0;
-
double loop_decay_rate = 1.00;
double last_loop_decay_rate = 1.00;
@@ -2095,6 +2136,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
double mv_ratio_accumulator_thresh;
double mv_in_out_thresh;
double abs_mv_in_out_thresh;
+ double sr_accumulator = 0.0;
unsigned int allow_alt_ref = is_altref_enabled(cpi);
int f_boost = 0;
@@ -2221,9 +2263,10 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
}
// Calculate a boost number for this frame.
- boost_score +=
- decay_accumulator *
- calc_frame_boost(cpi, &next_frame, this_frame_mv_in_out, GF_MAX_BOOST);
+ sr_accumulator = 0.0;
+ boost_score += decay_accumulator *
+ calc_frame_boost(cpi, &next_frame, &sr_accumulator,
+ this_frame_mv_in_out, GF_MAX_BOOST);
// Break out conditions.
if (
@@ -2473,6 +2516,10 @@ static int test_candidate_kf(TWO_PASS *twopass,
}
#define FRAMES_TO_CHECK_DECAY 8
+#define KF_MAX_FRAME_BOOST 96.0
+#define MIN_KF_TOT_BOOST 300
+#define MAX_KF_TOT_BOOST 5400
+#define KF_BOOST_SCAN_MAX_FRAMES 32
static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
int i, j;
@@ -2485,14 +2532,13 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
FIRSTPASS_STATS next_frame;
FIRSTPASS_STATS last_frame;
int kf_bits = 0;
- int loop_decay_counter = 0;
double decay_accumulator = 1.0;
- double av_decay_accumulator = 0.0;
double zero_motion_accumulator = 1.0;
double boost_score = 0.0;
double kf_mod_err = 0.0;
double kf_group_err = 0.0;
double recent_loop_decay[FRAMES_TO_CHECK_DECAY];
+ double sr_accumulator = 0.0;
vp9_zero(next_frame);
@@ -2642,34 +2688,36 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
// Scan through the kf group collating various stats used to determine
// how many bits to spend on it.
- decay_accumulator = 1.0;
boost_score = 0.0;
+
for (i = 0; i < (rc->frames_to_key - 1); ++i) {
if (EOF == input_stats(twopass, &next_frame)) break;
- // Monitor for static sections.
- zero_motion_accumulator = VPXMIN(zero_motion_accumulator,
- get_zero_motion_factor(cpi, &next_frame));
-
- // Not all frames in the group are necessarily used in calculating boost.
- if ((i <= rc->max_gf_interval) ||
- ((i <= (rc->max_gf_interval * 4)) && (decay_accumulator > 0.5))) {
- const double frame_boost =
- calc_frame_boost(cpi, &next_frame, 0, KF_MAX_BOOST);
-
- // How fast is prediction quality decaying.
- if (!detect_flash(twopass, 0)) {
- const double loop_decay_rate =
- get_prediction_decay_rate(cpi, &next_frame);
- decay_accumulator *= loop_decay_rate;
- decay_accumulator = VPXMAX(decay_accumulator, MIN_DECAY_FACTOR);
- av_decay_accumulator += decay_accumulator;
- ++loop_decay_counter;
- }
- boost_score += (decay_accumulator * frame_boost);
+ if (i <= KF_BOOST_SCAN_MAX_FRAMES) {
+ double frame_boost;
+ double zm_factor;
+
+ // Monitor for static sections.
+ zero_motion_accumulator = VPXMIN(
+ zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame));
+
+ // Factor 0.75-1.25 based on how much of frame is static.
+ zm_factor = (0.75 + (zero_motion_accumulator / 2.0));
+
+ // The second (lagging) ref error is not valid immediately after
+ // a key frame because either the lag has not built up (in the case of
+ // the first key frame) or it points to a reference before the new key
+ // frame.
+ if (i < 2) sr_accumulator = 0.0;
+ frame_boost = calc_kf_frame_boost(cpi, &next_frame, &sr_accumulator, 0,
+ KF_MAX_FRAME_BOOST * zm_factor);
+
+ boost_score += frame_boost;
+ if (frame_boost < 25.00) break;
+ } else {
+ break;
}
}
- av_decay_accumulator /= (double)loop_decay_counter;
reset_fpf_position(twopass, start_position);
@@ -2681,9 +2729,9 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
start_position, twopass->stats_in_end, rc->frames_to_key);
// Apply various clamps for min and max boost
- rc->kf_boost = (int)(av_decay_accumulator * boost_score);
- rc->kf_boost = VPXMAX(rc->kf_boost, (rc->frames_to_key * 3));
- rc->kf_boost = VPXMAX(rc->kf_boost, MIN_KF_BOOST);
+ rc->kf_boost = VPXMAX((int)boost_score, (rc->frames_to_key * 3));
+ rc->kf_boost = VPXMAX(rc->kf_boost, MIN_KF_TOT_BOOST);
+ rc->kf_boost = VPXMIN(rc->kf_boost, MAX_KF_TOT_BOOST);
// Work out how many bits to allocate for the key frame itself.
kf_bits = calculate_boost_bits((rc->frames_to_key - 1), rc->kf_boost,
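To make the reworked key-frame boost clamps concrete, a worked example with hypothetical numbers (boost_score is the sum accumulated in the scan above):

    /* boost_score = 2000.0, rc->frames_to_key = 60:
     *   kf_boost = VPXMAX(2000, 60 * 3)  -> 2000
     *   kf_boost = VPXMAX(2000, 300)     -> 2000  (MIN_KF_TOT_BOOST floor)
     *   kf_boost = VPXMIN(2000, 5400)    -> 2000  (MAX_KF_TOT_BOOST ceiling)
     * A short or very static group, e.g. boost_score = 120 with
     * frames_to_key = 10, is raised to the 300 floor instead, while an
     * extremely high score is now capped at 5400 rather than growing
     * without bound. */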
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_pickmode.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_pickmode.c
index 76d2611d89f..2b7ddbcd948 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_pickmode.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_pickmode.c
@@ -1080,12 +1080,14 @@ typedef struct {
PREDICTION_MODE pred_mode;
} REF_MODE;
-#define RT_INTER_MODES 8
+#define RT_INTER_MODES 12
static const REF_MODE ref_mode_set[RT_INTER_MODES] = {
{ LAST_FRAME, ZEROMV }, { LAST_FRAME, NEARESTMV },
{ GOLDEN_FRAME, ZEROMV }, { LAST_FRAME, NEARMV },
{ LAST_FRAME, NEWMV }, { GOLDEN_FRAME, NEARESTMV },
- { GOLDEN_FRAME, NEARMV }, { GOLDEN_FRAME, NEWMV }
+ { GOLDEN_FRAME, NEARMV }, { GOLDEN_FRAME, NEWMV },
+ { ALTREF_FRAME, ZEROMV }, { ALTREF_FRAME, NEARESTMV },
+ { ALTREF_FRAME, NEARMV }, { ALTREF_FRAME, NEWMV }
};
static const REF_MODE ref_mode_set_svc[RT_INTER_MODES] = {
{ LAST_FRAME, ZEROMV }, { GOLDEN_FRAME, ZEROMV },
@@ -1467,6 +1469,10 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
usable_ref_frame = GOLDEN_FRAME;
}
+ if (cpi->oxcf.lag_in_frames > 0 && cpi->oxcf.rc_mode == VPX_VBR &&
+ (cpi->rc.alt_ref_gf_group || cpi->rc.is_src_frame_alt_ref))
+ usable_ref_frame = ALTREF_FRAME;
+
// For svc mode, on spatial_layer_id > 0: if the reference has different scale
// constrain the inter mode to only test zero motion.
if (cpi->use_svc && svc->force_zero_mode_spatial_ref &&
@@ -1506,7 +1512,13 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
int this_early_term = 0;
PREDICTION_MODE this_mode = ref_mode_set[idx].pred_mode;
- if (cpi->use_svc) this_mode = ref_mode_set_svc[idx].pred_mode;
+ ref_frame = ref_mode_set[idx].ref_frame;
+
+ if (cpi->use_svc) {
+ this_mode = ref_mode_set_svc[idx].pred_mode;
+ ref_frame = ref_mode_set_svc[idx].ref_frame;
+ }
+ if (ref_frame > usable_ref_frame) continue;
if (sf->short_circuit_flat_blocks && x->source_variance == 0 &&
this_mode != NEARESTMV) {
@@ -1515,9 +1527,23 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
if (!(cpi->sf.inter_mode_mask[bsize] & (1 << this_mode))) continue;
- ref_frame = ref_mode_set[idx].ref_frame;
- if (cpi->use_svc) {
- ref_frame = ref_mode_set_svc[idx].ref_frame;
+ if (cpi->oxcf.lag_in_frames > 0 && cpi->oxcf.rc_mode == VPX_VBR) {
+ if (cpi->rc.is_src_frame_alt_ref &&
+ (ref_frame != ALTREF_FRAME ||
+ frame_mv[this_mode][ref_frame].as_int != 0))
+ continue;
+
+ if (cpi->rc.alt_ref_gf_group &&
+ cpi->rc.frames_since_golden > (cpi->rc.baseline_gf_interval >> 1) &&
+ ref_frame == GOLDEN_FRAME &&
+ frame_mv[this_mode][ref_frame].as_int != 0)
+ continue;
+
+ if (cpi->rc.alt_ref_gf_group &&
+ cpi->rc.frames_since_golden < (cpi->rc.baseline_gf_interval >> 1) &&
+ ref_frame == ALTREF_FRAME &&
+ frame_mv[this_mode][ref_frame].as_int != 0)
+ continue;
}
if (!(cpi->ref_frame_flags & flag_list[ref_frame])) continue;
@@ -1543,13 +1569,27 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
continue;
}
- if (!force_skip_low_temp_var &&
+ if (sf->reference_masking &&
!(frame_mv[this_mode][ref_frame].as_int == 0 &&
ref_frame == LAST_FRAME)) {
- i = (ref_frame == LAST_FRAME) ? GOLDEN_FRAME : LAST_FRAME;
- if ((cpi->ref_frame_flags & flag_list[i]) && sf->reference_masking)
- if (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[i] << 1))
+ if (usable_ref_frame < ALTREF_FRAME) {
+ if (!force_skip_low_temp_var && usable_ref_frame > LAST_FRAME) {
+ i = (ref_frame == LAST_FRAME) ? GOLDEN_FRAME : LAST_FRAME;
+ if ((cpi->ref_frame_flags & flag_list[i]))
+ if (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[i] << 1))
+ ref_frame_skip_mask |= (1 << ref_frame);
+ }
+ } else if (!cpi->rc.is_src_frame_alt_ref &&
+ !(frame_mv[this_mode][ref_frame].as_int == 0 &&
+ ref_frame == ALTREF_FRAME)) {
+ int ref1 = (ref_frame == GOLDEN_FRAME) ? LAST_FRAME : GOLDEN_FRAME;
+ int ref2 = (ref_frame == ALTREF_FRAME) ? LAST_FRAME : ALTREF_FRAME;
+ if (((cpi->ref_frame_flags & flag_list[ref1]) &&
+ (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[ref1] << 1))) ||
+ ((cpi->ref_frame_flags & flag_list[ref2]) &&
+ (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[ref2] << 1))))
ref_frame_skip_mask |= (1 << ref_frame);
+ }
}
if (ref_frame_skip_mask & (1 << ref_frame)) continue;
@@ -1884,6 +1924,9 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
svc_force_zero_mode[best_ref_frame - 1]);
inter_mode_thresh = (inter_mode_thresh << 1) + inter_mode_thresh;
}
+ if (cpi->oxcf.lag_in_frames > 0 && cpi->oxcf.rc_mode == VPX_VBR &&
+ cpi->rc.is_src_frame_alt_ref)
+ perform_intra_pred = 0;
// Perform intra prediction search, if the best SAD is above a certain
// threshold.
if ((!force_skip_low_temp_var || bsize < BLOCK_32X32) && perform_intra_pred &&
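The VBR-with-lag changes above prune the non-RD mode loop around alt-ref GF groups. Below is a condensed, self-contained restatement of that gating for readability; it is a sketch only (the in-tree code additionally conditions on lag_in_frames > 0 and rc_mode == VPX_VBR), and the reference-frame constants simply mirror the vp9 enum values.

    /* Sketch of the mode gating added above; not the in-tree code. */
    enum { LAST_FRAME = 1, GOLDEN_FRAME = 2, ALTREF_FRAME = 3 }; /* vp9 values */

    static int skip_mode_for_vbr_altref(int is_src_frame_alt_ref,
                                        int alt_ref_gf_group,
                                        int frames_since_golden,
                                        int gf_interval, int ref_frame,
                                        int mv_is_zero) {
      /* On the frame that overlays the alt-ref, keep only zero-MV ALTREF. */
      if (is_src_frame_alt_ref)
        return !(ref_frame == ALTREF_FRAME && mv_is_zero);
      if (!alt_ref_gf_group || mv_is_zero) return 0;
      /* Second half of the GF group: the alt-ref is temporally closer, so
       * drop non-zero-MV GOLDEN modes. */
      if (frames_since_golden > (gf_interval >> 1))
        return ref_frame == GOLDEN_FRAME;
      /* First half: GOLDEN is closer, so drop non-zero-MV ALTREF modes. */
      if (frames_since_golden < (gf_interval >> 1))
        return ref_frame == ALTREF_FRAME;
      return 0;
    }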
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.c
index 93eddd655ac..b5cfd5de6c6 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.c
@@ -45,6 +45,9 @@
#define FRAME_OVERHEAD_BITS 200
+// Use this macro to turn on/off use of alt-refs in one-pass mode.
+#define USE_ALTREF_FOR_ONE_PASS 0
+
#if CONFIG_VP9_HIGHBITDEPTH
#define ASSIGN_MINQ_TABLE(bit_depth, name) \
do { \
@@ -327,6 +330,7 @@ void vp9_rc_init(const VP9EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) {
rc->prev_avg_source_sad_lag = 0;
rc->high_source_sad = 0;
rc->high_source_sad_lagindex = -1;
+ rc->alt_ref_gf_group = 0;
rc->fac_active_worst_inter = 150;
rc->fac_active_worst_gf = 100;
rc->force_qpmin = 0;
@@ -561,6 +565,13 @@ int vp9_rc_regulate_q(const VP9_COMP *cpi, int target_bits_per_frame,
q = clamp(q, VPXMIN(cpi->rc.q_1_frame, cpi->rc.q_2_frame),
VPXMAX(cpi->rc.q_1_frame, cpi->rc.q_2_frame));
}
+#if USE_ALTREF_FOR_ONE_PASS
+ if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_VBR &&
+ cpi->oxcf.lag_in_frames > 0 && cpi->rc.is_src_frame_alt_ref &&
+ !cpi->rc.alt_ref_gf_group) {
+ q = VPXMIN(q, (q + cpi->rc.last_boosted_qindex) >> 1);
+ }
+#endif
return q;
}
@@ -1429,24 +1440,16 @@ void vp9_rc_postencode_update_drop_frame(VP9_COMP *cpi) {
cpi->rc.rc_1_frame = 0;
}
-// Use this macro to turn on/off use of alt-refs in one-pass mode.
-#define USE_ALTREF_FOR_ONE_PASS 1
-
static int calc_pframe_target_size_one_pass_vbr(const VP9_COMP *const cpi) {
const RATE_CONTROL *const rc = &cpi->rc;
- int target;
const int af_ratio = rc->af_ratio_onepass_vbr;
-#if USE_ALTREF_FOR_ONE_PASS
- target =
+ int target =
(!rc->is_src_frame_alt_ref &&
(cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))
? (rc->avg_frame_bandwidth * rc->baseline_gf_interval * af_ratio) /
(rc->baseline_gf_interval + af_ratio - 1)
: (rc->avg_frame_bandwidth * rc->baseline_gf_interval) /
(rc->baseline_gf_interval + af_ratio - 1);
-#else
- target = rc->avg_frame_bandwidth;
-#endif
return vp9_rc_clamp_pframe_target_size(cpi, target);
}
@@ -1499,8 +1502,8 @@ void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) {
if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cpi->oxcf.pass == 0) {
vp9_cyclic_refresh_set_golden_update(cpi);
} else {
- rc->baseline_gf_interval =
- (rc->min_gf_interval + rc->max_gf_interval) / 2;
+ rc->baseline_gf_interval = VPXMIN(
+ 20, VPXMAX(10, (rc->min_gf_interval + rc->max_gf_interval) / 2));
}
rc->af_ratio_onepass_vbr = 10;
if (rc->rolling_target_bits > 0)
@@ -1526,6 +1529,7 @@ void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) {
rc->frames_till_gf_update_due = rc->baseline_gf_interval;
cpi->refresh_golden_frame = 1;
rc->source_alt_ref_pending = USE_ALTREF_FOR_ONE_PASS;
+ rc->alt_ref_gf_group = USE_ALTREF_FOR_ONE_PASS;
}
if (cm->frame_type == KEY_FRAME)
target = calc_iframe_target_size_one_pass_vbr(cpi);
@@ -2088,8 +2092,8 @@ void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi, uint64_t avg_sad_current) {
rc->high_source_sad_lagindex = high_source_sad_lagindex;
// Adjust some factors for the next GF group, ignore initial key frame,
// and only for lag_in_frames not too small.
- if (cpi->refresh_golden_frame == 1 && cm->frame_type != KEY_FRAME &&
- cm->current_video_frame > 30 && cpi->oxcf.lag_in_frames > 8) {
+ if (cpi->refresh_golden_frame == 1 && cm->current_video_frame > 30 &&
+ cpi->oxcf.lag_in_frames > 8) {
int frame_constraint;
if (rc->rolling_target_bits > 0)
rate_err =
@@ -2110,6 +2114,8 @@ void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi, uint64_t avg_sad_current) {
? VPXMAX(10, rc->baseline_gf_interval >> 1)
: VPXMAX(6, rc->baseline_gf_interval >> 1);
}
+ if (rc->baseline_gf_interval > cpi->oxcf.lag_in_frames - 1)
+ rc->baseline_gf_interval = cpi->oxcf.lag_in_frames - 1;
// Check for constraining gf_interval for up-coming scene/content changes,
// or for up-coming key frame, whichever is closer.
frame_constraint = rc->frames_to_key;
@@ -2133,6 +2139,23 @@ void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi, uint64_t avg_sad_current) {
rc->af_ratio_onepass_vbr = 5;
rc->gfu_boost = DEFAULT_GF_BOOST >> 2;
}
+#if USE_ALTREF_FOR_ONE_PASS
+ // Don't use alt-ref if there is a scene cut within the group,
+ // or the content is not sufficiently low-motion.
+ if ((rc->high_source_sad_lagindex > 0 &&
+ rc->high_source_sad_lagindex <= rc->frames_till_gf_update_due) ||
+ (avg_source_sad_lag > 3 * sad_thresh1 >> 3)) {
+ rc->source_alt_ref_pending = 0;
+ rc->alt_ref_gf_group = 0;
+ } else {
+ rc->source_alt_ref_pending = 1;
+ rc->alt_ref_gf_group = 1;
+ // If alt-ref is used for this gf group, limit the interval.
+ if (rc->baseline_gf_interval > 10 &&
+ rc->baseline_gf_interval < rc->frames_to_key)
+ rc->baseline_gf_interval = 10;
+ }
+#endif
target = calc_pframe_target_size_one_pass_vbr(cpi);
vp9_rc_set_frame_target(cpi, target);
}
@@ -2261,6 +2284,7 @@ void vp9_avg_source_sad(VP9_COMP *cpi) {
cpi->ext_refresh_frame_flags_pending == 0) {
int target;
cpi->refresh_golden_frame = 1;
+ rc->source_alt_ref_pending = USE_ALTREF_FOR_ONE_PASS;
rc->gfu_boost = DEFAULT_GF_BOOST >> 1;
rc->baseline_gf_interval =
VPXMIN(20, VPXMAX(10, rc->baseline_gf_interval));
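Since calc_pframe_target_size_one_pass_vbr() above now always applies the af_ratio weighting (the #if USE_ALTREF_FOR_ONE_PASS fallback to a flat per-frame target is gone), a worked example with hypothetical numbers shows that the weighting only redistributes the group budget:

    /* avg_frame_bandwidth = 10000 bits, baseline_gf_interval = 15,
     * af_ratio_onepass_vbr = 10:
     *   boosted frame (golden/alt-ref refresh, not an overlay):
     *     10000 * 15 * 10 / (15 + 10 - 1) = 62500 bits
     *   regular inter frame:
     *     10000 * 15      / (15 + 10 - 1) =  6250 bits
     * One boosted frame plus 14 regular frames totals 150000 bits, i.e. the
     * 15-frame group budget, before vp9_rc_clamp_pframe_target_size() is
     * applied. */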
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.h
index 6006e9b051a..70aef03ffb4 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.h
@@ -160,6 +160,7 @@ typedef struct {
uint64_t avg_source_sad[MAX_LAG_BUFFERS];
uint64_t prev_avg_source_sad_lag;
int high_source_sad_lagindex;
+ int alt_ref_gf_group;
int high_source_sad;
int count_last_scene_change;
int avg_frame_low_motion;
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.c
index ea893609193..3e1ed50a6d2 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.c
@@ -421,6 +421,10 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, int speed,
(frames_since_key % (sf->last_partitioning_redo_frequency << 1) == 1);
sf->max_delta_qindex = is_keyframe ? 20 : 15;
sf->partition_search_type = REFERENCE_PARTITION;
+ if (cpi->oxcf.rc_mode == VPX_VBR && cpi->oxcf.lag_in_frames > 0 &&
+ cpi->rc.is_src_frame_alt_ref) {
+ sf->partition_search_type = VAR_BASED_PARTITION;
+ }
sf->use_nonrd_pick_mode = 1;
sf->allow_skip_recode = 0;
sf->inter_mode_mask[BLOCK_32X32] = INTER_NEAREST_NEW_ZERO;
@@ -504,7 +508,6 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, int speed,
sf->short_circuit_low_temp_var = 2;
}
sf->limit_newmv_early_exit = 0;
- sf->bias_golden = 0;
}
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c
index fb2a9254172..b3c3d7beb9e 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c
@@ -12,14 +12,17 @@
#include <tmmintrin.h> // SSSE3
#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/x86/fdct.h"
#include "vpx_dsp/x86/inv_txfm_sse2.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"
void vp9_fdct8x8_quant_ssse3(
- const int16_t *input, int stride, int16_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs,
int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr,
const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
- int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
uint16_t *eob_ptr, const int16_t *scan_ptr, const int16_t *iscan_ptr) {
__m128i zero;
int pass;
@@ -328,15 +331,15 @@ void vp9_fdct8x8_quant_ssse3(
qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
- _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0);
- _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+ store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);
+ store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);
coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
dequant = _mm_unpackhi_epi64(dequant, dequant);
coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
- _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0);
- _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+ store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);
+ store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);
}
{
@@ -398,20 +401,21 @@ void vp9_fdct8x8_quant_ssse3(
qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
- _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0);
- _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+ store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);
+ store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);
coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
- _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0);
- _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+ store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);
+ store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);
} else {
- _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero);
- _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero);
+ // Maybe a more efficient way to store 0?
+ store_zero_tran_low(qcoeff_ptr + n_coeffs);
+ store_zero_tran_low(qcoeff_ptr + n_coeffs + 8);
- _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero);
- _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero);
+ store_zero_tran_low(dqcoeff_ptr + n_coeffs);
+ store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8);
}
}
@@ -452,10 +456,10 @@ void vp9_fdct8x8_quant_ssse3(
}
} else {
do {
- _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero);
- _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero);
- _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero);
- _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero);
+ store_zero_tran_low(dqcoeff_ptr + n_coeffs);
+ store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8);
+ store_zero_tran_low(qcoeff_ptr + n_coeffs);
+ store_zero_tran_low(qcoeff_ptr + n_coeffs + 8);
n_coeffs += 8 * 2;
} while (n_coeffs < 0);
*eob_ptr = 0;
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/vp9_dx_iface.c b/chromium/third_party/libvpx/source/libvpx/vp9/vp9_dx_iface.c
index 3b5dc3ddac0..0a3e84a0da2 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/vp9_dx_iface.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/vp9_dx_iface.c
@@ -467,8 +467,8 @@ static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx,
// as the size of the first intra frame be better? This will
// avoid too many deallocate and allocate.
if (frame_worker_data->scratch_buffer_size < data_sz) {
- frame_worker_data->scratch_buffer =
- (uint8_t *)vpx_realloc(frame_worker_data->scratch_buffer, data_sz);
+ vpx_free(frame_worker_data->scratch_buffer);
+ frame_worker_data->scratch_buffer = (uint8_t *)vpx_malloc(data_sz);
if (frame_worker_data->scratch_buffer == NULL) {
set_error_detail(ctx, "Failed to reallocate scratch buffer");
return VPX_CODEC_MEM_ERROR;
@@ -553,6 +553,9 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx,
ctx->decrypt_cb, ctx->decrypt_state);
if (res != VPX_CODEC_OK) return res;
+ if (ctx->svc_decoding && ctx->svc_spatial_layer < frame_count - 1)
+ frame_count = ctx->svc_spatial_layer + 1;
+
if (ctx->frame_parallel_decode) {
// Decode in frame parallel mode. When decoding in this mode, the frame
// passed to the decoder must be either a normal frame or a superframe with
@@ -1001,6 +1004,16 @@ static vpx_codec_err_t ctrl_set_skip_loop_filter(vpx_codec_alg_priv_t *ctx,
return VPX_CODEC_OK;
}
+static vpx_codec_err_t ctrl_set_spatial_layer_svc(vpx_codec_alg_priv_t *ctx,
+ va_list args) {
+ ctx->svc_decoding = 1;
+ ctx->svc_spatial_layer = va_arg(args, int);
+ if (ctx->svc_spatial_layer < 0)
+ return VPX_CODEC_INVALID_PARAM;
+ else
+ return VPX_CODEC_OK;
+}
+
static vpx_codec_ctrl_fn_map_t decoder_ctrl_maps[] = {
{ VP8_COPY_REFERENCE, ctrl_copy_reference },
@@ -1011,6 +1024,7 @@ static vpx_codec_ctrl_fn_map_t decoder_ctrl_maps[] = {
{ VPXD_SET_DECRYPTOR, ctrl_set_decryptor },
{ VP9_SET_BYTE_ALIGNMENT, ctrl_set_byte_alignment },
{ VP9_SET_SKIP_LOOP_FILTER, ctrl_set_skip_loop_filter },
+ { VP9_DECODE_SVC_SPATIAL_LAYER, ctrl_set_spatial_layer_svc },
// Getters
{ VP8D_GET_LAST_REF_UPDATES, ctrl_get_last_ref_updates },
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/vp9_dx_iface.h b/chromium/third_party/libvpx/source/libvpx/vp9/vp9_dx_iface.h
index cc3d51842ac..c1559599b8c 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/vp9_dx_iface.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/vp9_dx_iface.h
@@ -60,6 +60,10 @@ struct vpx_codec_alg_priv {
void *ext_priv; // Private data associated with the external frame buffers.
vpx_get_frame_buffer_cb_fn_t get_ext_fb_cb;
vpx_release_frame_buffer_cb_fn_t release_ext_fb_cb;
+
+ // Allow decoding up to a given spatial layer of an SVC stream.
+ int svc_decoding;
+ int svc_spatial_layer;
};
#endif // VP9_VP9_DX_IFACE_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx/src/svc_encodeframe.c b/chromium/third_party/libvpx/source/libvpx/vpx/src/svc_encodeframe.c
index 5aa0b8ddb84..88b1531d8c4 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx/src/svc_encodeframe.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx/src/svc_encodeframe.c
@@ -53,6 +53,10 @@ static const int DEFAULT_SCALE_FACTORS_NUM[VPX_SS_MAX_LAYERS] = { 4, 5, 7, 11,
static const int DEFAULT_SCALE_FACTORS_DEN[VPX_SS_MAX_LAYERS] = { 16, 16, 16,
16, 16 };
+static const int DEFAULT_SCALE_FACTORS_NUM_2x[VPX_SS_MAX_LAYERS] = { 1, 2, 4 };
+
+static const int DEFAULT_SCALE_FACTORS_DEN_2x[VPX_SS_MAX_LAYERS] = { 4, 4, 4 };
+
typedef enum {
QUANTIZER = 0,
BITRATE,
@@ -156,6 +160,9 @@ static vpx_codec_err_t parse_layer_options_from_string(SvcContext *svc_ctx,
char *token;
const char *delim = ",";
char *save_ptr;
+ int num_layers = svc_ctx->spatial_layers;
+ if (type == BITRATE)
+ num_layers = svc_ctx->spatial_layers * svc_ctx->temporal_layers;
if (input == NULL || option0 == NULL ||
(option1 == NULL && type == SCALE_FACTOR))
@@ -163,7 +170,7 @@ static vpx_codec_err_t parse_layer_options_from_string(SvcContext *svc_ctx,
input_string = strdup(input);
token = strtok_r(input_string, delim, &save_ptr);
- for (i = 0; i < svc_ctx->spatial_layers; ++i) {
+ for (i = 0; i < num_layers; ++i) {
if (token != NULL) {
res = extract_option(type, token, option0 + i, option1 + i);
if (res != VPX_CODEC_OK) break;
@@ -172,11 +179,11 @@ static vpx_codec_err_t parse_layer_options_from_string(SvcContext *svc_ctx,
break;
}
}
- if (res == VPX_CODEC_OK && i != svc_ctx->spatial_layers) {
+ if (res == VPX_CODEC_OK && i != num_layers) {
svc_log(svc_ctx, SVC_LOG_ERROR,
"svc: layer params type: %d %d values required, "
"but only %d specified\n",
- type, svc_ctx->spatial_layers, i);
+ type, num_layers, i);
res = VPX_CODEC_INVALID_PARAM;
}
free(input_string);
@@ -287,24 +294,30 @@ vpx_codec_err_t vpx_svc_set_options(SvcContext *svc_ctx, const char *options) {
return VPX_CODEC_OK;
}
-void assign_layer_bitrates(const SvcContext *svc_ctx,
- vpx_codec_enc_cfg_t *const enc_cfg) {
+vpx_codec_err_t assign_layer_bitrates(const SvcContext *svc_ctx,
+ vpx_codec_enc_cfg_t *const enc_cfg) {
int i;
const SvcInternal_t *const si = get_const_svc_internal(svc_ctx);
int sl, tl, spatial_layer_target;
if (svc_ctx->temporal_layering_mode != 0) {
if (si->bitrates[0] != 0) {
- enc_cfg->rc_target_bitrate = 0;
+ unsigned int total_bitrate = 0;
for (sl = 0; sl < svc_ctx->spatial_layers; ++sl) {
- enc_cfg->ss_target_bitrate[sl * svc_ctx->temporal_layers] = 0;
+ total_bitrate += si->bitrates[sl * svc_ctx->temporal_layers +
+ svc_ctx->temporal_layers - 1];
for (tl = 0; tl < svc_ctx->temporal_layers; ++tl) {
enc_cfg->ss_target_bitrate[sl * svc_ctx->temporal_layers] +=
(unsigned int)si->bitrates[sl * svc_ctx->temporal_layers + tl];
enc_cfg->layer_target_bitrate[sl * svc_ctx->temporal_layers + tl] =
si->bitrates[sl * svc_ctx->temporal_layers + tl];
+ if (tl > 0 && (si->bitrates[sl * svc_ctx->temporal_layers + tl] <=
+ si->bitrates[sl * svc_ctx->temporal_layers + tl - 1]))
+ return VPX_CODEC_INVALID_PARAM;
}
}
+ if (total_bitrate != enc_cfg->rc_target_bitrate)
+ return VPX_CODEC_INVALID_PARAM;
} else {
float total = 0;
float alloc_ratio[VPX_MAX_LAYERS] = { 0 };
@@ -341,11 +354,14 @@ void assign_layer_bitrates(const SvcContext *svc_ctx,
}
} else {
if (si->bitrates[0] != 0) {
- enc_cfg->rc_target_bitrate = 0;
+ unsigned int total_bitrate = 0;
for (i = 0; i < svc_ctx->spatial_layers; ++i) {
enc_cfg->ss_target_bitrate[i] = (unsigned int)si->bitrates[i];
- enc_cfg->rc_target_bitrate += si->bitrates[i];
+ enc_cfg->layer_target_bitrate[i] = (unsigned int)si->bitrates[i];
+ total_bitrate += si->bitrates[i];
}
+ if (total_bitrate != enc_cfg->rc_target_bitrate)
+ return VPX_CODEC_INVALID_PARAM;
} else {
float total = 0;
float alloc_ratio[VPX_MAX_LAYERS] = { 0 };
@@ -368,6 +384,7 @@ void assign_layer_bitrates(const SvcContext *svc_ctx,
}
}
}
+ return VPX_CODEC_OK;
}
vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
@@ -412,12 +429,24 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
si->svc_params.scaling_factor_den[sl] = DEFAULT_SCALE_FACTORS_DEN[sl];
si->svc_params.speed_per_layer[sl] = svc_ctx->speed;
}
-
+ if (enc_cfg->rc_end_usage == VPX_CBR && enc_cfg->g_pass == VPX_RC_ONE_PASS &&
+ svc_ctx->spatial_layers <= 3) {
+ for (sl = 0; sl < svc_ctx->spatial_layers; ++sl) {
+ int sl2 = (svc_ctx->spatial_layers == 2) ? sl + 1 : sl;
+ si->svc_params.scaling_factor_num[sl] = DEFAULT_SCALE_FACTORS_NUM_2x[sl2];
+ si->svc_params.scaling_factor_den[sl] = DEFAULT_SCALE_FACTORS_DEN_2x[sl2];
+ }
+ }
for (tl = 0; tl < svc_ctx->temporal_layers; ++tl) {
for (sl = 0; sl < svc_ctx->spatial_layers; ++sl) {
i = sl * svc_ctx->temporal_layers + tl;
si->svc_params.max_quantizers[i] = MAX_QUANTIZER;
si->svc_params.min_quantizers[i] = 0;
+ if (enc_cfg->rc_end_usage == VPX_CBR &&
+ enc_cfg->g_pass == VPX_RC_ONE_PASS) {
+ si->svc_params.max_quantizers[i] = 56;
+ si->svc_params.min_quantizers[i] = 2;
+ }
}
}
@@ -442,7 +471,15 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
(int)VPX_MAX_LAYERS);
return VPX_CODEC_INVALID_PARAM;
}
- assign_layer_bitrates(svc_ctx, enc_cfg);
+ res = assign_layer_bitrates(svc_ctx, enc_cfg);
+ if (res != VPX_CODEC_OK) {
+ svc_log(svc_ctx, SVC_LOG_ERROR,
+ "layer bitrates incorrect: \n"
+ "1) spatial layer bitrates should sum up to target \n"
+ "2) temporal layer bitrates should be increasing within \n"
+ "a spatial layer \n");
+ return VPX_CODEC_INVALID_PARAM;
+ }
#if CONFIG_SPATIAL_SVC
for (i = 0; i < svc_ctx->spatial_layers; ++i)
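The reworked assign_layer_bitrates() above now validates explicitly supplied per-layer bitrates instead of silently overwriting rc_target_bitrate. A small worked example of what a configuration must now satisfy (hypothetical numbers, 2 spatial x 3 temporal layers):

    /* Per-layer bitrates, in kbps, in spatial-major order as parsed for the
     * BITRATE option:
     *   spatial layer 0: 100, 200, 400    (strictly increasing per temporal
     *   spatial layer 1: 300, 600, 1200    layer, or VPX_CODEC_INVALID_PARAM)
     * The spatial-layer totals are the top temporal entries, 400 and 1200,
     * so enc_cfg->rc_target_bitrate must equal 1600 for vpx_svc_init() to
     * succeed; any mismatch now fails instead of being silently corrected. */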
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx/svc_context.h b/chromium/third_party/libvpx/source/libvpx/vpx/svc_context.h
index c8bde5832a5..462785075cb 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx/svc_context.h
+++ b/chromium/third_party/libvpx/source/libvpx/vpx/svc_context.h
@@ -54,7 +54,7 @@ typedef struct SvcInternal {
// values extracted from option, quantizers
vpx_svc_extra_cfg_t svc_params;
int enable_auto_alt_ref[VPX_SS_MAX_LAYERS];
- int bitrates[VPX_SS_MAX_LAYERS];
+ int bitrates[VPX_MAX_LAYERS];
// accumulated statistics
double psnr_sum[VPX_SS_MAX_LAYERS][COMPONENTS]; // total/Y/U/V
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx/vp8cx.h b/chromium/third_party/libvpx/source/libvpx/vpx/vp8cx.h
index 8fa25e8bc07..cc90159bc3a 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx/vp8cx.h
+++ b/chromium/third_party/libvpx/source/libvpx/vpx/vp8cx.h
@@ -561,7 +561,22 @@ enum vp8e_enc_control_id {
*
* Supported in codecs: VP9
*/
- VP9E_SET_ALT_REF_AQ
+ VP9E_SET_ALT_REF_AQ,
+
+ /*!\brief Boost percentage for Golden Frame in CBR mode.
+ *
+ * This value controls the amount of boost given to Golden Frame in
+ * CBR mode. It is expressed as a percentage of the average
+ * per-frame bitrate, with the special (and default) value 0 meaning
+ * the feature is off, i.e., no golden frame boost in CBR mode and
+ * average bitrate target is used.
+ *
+ * For example, to allow 100% more bits, i.e., 2X, in a golden frame
+ * than in an average frame, set this to 100.
+ *
+ * Supported in codecs: VP8
+ */
+ VP8E_SET_GF_CBR_BOOST_PCT,
};
/*!\brief vpx 1-D scaling mode
@@ -769,6 +784,9 @@ VPX_CTRL_USE_TYPE(VP8E_SET_MAX_INTRA_BITRATE_PCT, unsigned int)
VPX_CTRL_USE_TYPE(VP8E_SET_MAX_INTER_BITRATE_PCT, unsigned int)
#define VPX_CTRL_VP8E_SET_MAX_INTER_BITRATE_PCT
+VPX_CTRL_USE_TYPE(VP8E_SET_GF_CBR_BOOST_PCT, unsigned int)
+#define VPX_CTRL_VP8E_SET_GF_CBR_BOOST_PCT
+
VPX_CTRL_USE_TYPE(VP8E_SET_SCREEN_CONTENT_MODE, unsigned int)
#define VPX_CTRL_VP8E_SET_SCREEN_CONTENT_MODE
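A minimal usage sketch for the control declared above, assuming a caller that already has an initialized VP8 encoder context (error handling omitted):

    #include "vpx/vpx_encoder.h"
    #include "vpx/vp8cx.h"

    /* Allow the golden frame to spend up to 100% more bits (2x) than an
     * average frame in CBR mode; 0 (the default) leaves the feature off. */
    static void enable_gf_cbr_boost(vpx_codec_ctx_t *encoder) {
      vpx_codec_control(encoder, VP8E_SET_GF_CBR_BOOST_PCT, 100);
    }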
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx/vp8dx.h b/chromium/third_party/libvpx/source/libvpx/vpx/vp8dx.h
index 88204acd378..0d7759eb25b 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx/vp8dx.h
+++ b/chromium/third_party/libvpx/source/libvpx/vpx/vp8dx.h
@@ -111,6 +111,11 @@ enum vp8_dec_control_id {
*/
VP9_SET_SKIP_LOOP_FILTER,
+ /** control function to decode an SVC stream up to spatial layer x,
+ * where x is passed in through the control and is 0 for the base layer.
+ */
+ VP9_DECODE_SVC_SPATIAL_LAYER,
+
VP8_DECODER_CTRL_ID_MAX
};
@@ -162,6 +167,8 @@ VPX_CTRL_USE_TYPE(VP9D_GET_FRAME_SIZE, int *)
#define VPX_CTRL_VP9D_GET_FRAME_SIZE
VPX_CTRL_USE_TYPE(VP9_INVERT_TILE_DECODE_ORDER, int)
#define VPX_CTRL_VP9_INVERT_TILE_DECODE_ORDER
+#define VPX_CTRL_VP9_DECODE_SVC_SPATIAL_LAYER
+VPX_CTRL_USE_TYPE(VP9_DECODE_SVC_SPATIAL_LAYER, int)
/*!\endcond */
/*! @} - end defgroup vp8_decoder */
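A minimal usage sketch for the new decoder control, assuming an initialized VP9 decoder context fed with an SVC stream (error handling omitted):

    #include "vpx/vpx_decoder.h"
    #include "vpx/vp8dx.h"

    /* Decode only up to spatial layer 1; the layer index is 0-based, so 0
     * selects the base layer. */
    static void limit_decode_to_layer1(vpx_codec_ctx_t *decoder) {
      vpx_codec_control(decoder, VP9_DECODE_SVC_SPATIAL_LAYER, 1);
    }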
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/fwd_txfm_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/fwd_txfm_neon.c
index 7cb2ba90d2f..e9503f13d70 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/fwd_txfm_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/fwd_txfm_neon.c
@@ -52,10 +52,10 @@ void vpx_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) {
v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), (int16_t)cospi_8_64);
v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), (int16_t)cospi_8_64);
v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), (int16_t)cospi_8_64);
- v_t0_lo = vmulq_n_s32(v_t0_lo, cospi_16_64);
- v_t0_hi = vmulq_n_s32(v_t0_hi, cospi_16_64);
- v_t1_lo = vmulq_n_s32(v_t1_lo, cospi_16_64);
- v_t1_hi = vmulq_n_s32(v_t1_hi, cospi_16_64);
+ v_t0_lo = vmulq_n_s32(v_t0_lo, (int32_t)cospi_16_64);
+ v_t0_hi = vmulq_n_s32(v_t0_hi, (int32_t)cospi_16_64);
+ v_t1_lo = vmulq_n_s32(v_t1_lo, (int32_t)cospi_16_64);
+ v_t1_hi = vmulq_n_s32(v_t1_hi, (int32_t)cospi_16_64);
{
const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_loopfilter_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_loopfilter_neon.c
new file mode 100644
index 00000000000..5530c6425b2
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_loopfilter_neon.c
@@ -0,0 +1,761 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+
+static INLINE void load_thresh(const uint8_t *blimit, const uint8_t *limit,
+ const uint8_t *thresh, uint16x8_t *blimit_vec,
+ uint16x8_t *limit_vec, uint16x8_t *thresh_vec,
+ const int bd) {
+ const int16x8_t shift = vdupq_n_s16(bd - 8);
+ *blimit_vec = vmovl_u8(vld1_dup_u8(blimit));
+ *limit_vec = vmovl_u8(vld1_dup_u8(limit));
+ *thresh_vec = vmovl_u8(vld1_dup_u8(thresh));
+ *blimit_vec = vshlq_u16(*blimit_vec, shift);
+ *limit_vec = vshlq_u16(*limit_vec, shift);
+ *thresh_vec = vshlq_u16(*thresh_vec, shift);
+}
+
+// Here flat is 128-bit long, with each 16-bit chunk being a mask of
+// a pixel. When used to control filter branches, we only detect whether it is
+// all 0s or all 1s. We pairwise add flat to a 32-bit long number flat_status.
+// flat equals 0 if and only if flat_status equals 0.
+// flat equals -1 (all 1s) if and only if flat_status equals -4. (This is true
+// because each mask occupies more than 1 bit.)
+static INLINE uint32_t calc_flat_status(const uint16x8_t flat) {
+ const uint64x1_t t0 = vadd_u64(vreinterpret_u64_u16(vget_low_u16(flat)),
+ vreinterpret_u64_u16(vget_high_u16(flat)));
+ const uint64x1_t t1 = vpaddl_u32(vreinterpret_u32_u64(t0));
+ return vget_lane_u32(vreinterpret_u32_u64(t1), 0);
+}
+
+static INLINE uint16x8_t
+filter_hev_mask4(const uint16x8_t limit, const uint16x8_t blimit,
+ const uint16x8_t thresh, const uint16x8_t p3,
+ const uint16x8_t p2, const uint16x8_t p1, const uint16x8_t p0,
+ const uint16x8_t q0, const uint16x8_t q1, const uint16x8_t q2,
+ const uint16x8_t q3, uint16x8_t *hev, uint16x8_t *mask) {
+ uint16x8_t max, t0, t1;
+
+ max = vabdq_u16(p1, p0);
+ max = vmaxq_u16(max, vabdq_u16(q1, q0));
+ *hev = vcgtq_u16(max, thresh);
+ *mask = vmaxq_u16(max, vabdq_u16(p3, p2));
+ *mask = vmaxq_u16(*mask, vabdq_u16(p2, p1));
+ *mask = vmaxq_u16(*mask, vabdq_u16(q2, q1));
+ *mask = vmaxq_u16(*mask, vabdq_u16(q3, q2));
+ t0 = vabdq_u16(p0, q0);
+ t1 = vabdq_u16(p1, q1);
+ t0 = vaddq_u16(t0, t0);
+ t1 = vshrq_n_u16(t1, 1);
+ t0 = vaddq_u16(t0, t1);
+ *mask = vcleq_u16(*mask, limit);
+ t0 = vcleq_u16(t0, blimit);
+ *mask = vandq_u16(*mask, t0);
+
+ return max;
+}
+
+static INLINE uint16x8_t filter_flat_hev_mask(
+ const uint16x8_t limit, const uint16x8_t blimit, const uint16x8_t thresh,
+ const uint16x8_t p3, const uint16x8_t p2, const uint16x8_t p1,
+ const uint16x8_t p0, const uint16x8_t q0, const uint16x8_t q1,
+ const uint16x8_t q2, const uint16x8_t q3, uint16x8_t *flat,
+ uint32_t *flat_status, uint16x8_t *hev, const int bd) {
+ uint16x8_t mask;
+ const uint16x8_t max = filter_hev_mask4(limit, blimit, thresh, p3, p2, p1, p0,
+ q0, q1, q2, q3, hev, &mask);
+ *flat = vmaxq_u16(max, vabdq_u16(p2, p0));
+ *flat = vmaxq_u16(*flat, vabdq_u16(q2, q0));
+ *flat = vmaxq_u16(*flat, vabdq_u16(p3, p0));
+ *flat = vmaxq_u16(*flat, vabdq_u16(q3, q0));
+ *flat = vcleq_u16(*flat, vdupq_n_u16(1 << (bd - 8))); /* flat_mask4() */
+ *flat = vandq_u16(*flat, mask);
+ *flat_status = calc_flat_status(*flat);
+
+ return mask;
+}
+
+static INLINE uint16x8_t flat_mask5(const uint16x8_t p4, const uint16x8_t p3,
+ const uint16x8_t p2, const uint16x8_t p1,
+ const uint16x8_t p0, const uint16x8_t q0,
+ const uint16x8_t q1, const uint16x8_t q2,
+ const uint16x8_t q3, const uint16x8_t q4,
+ const uint16x8_t flat,
+ uint32_t *flat2_status, const int bd) {
+ uint16x8_t flat2 = vabdq_u16(p4, p0);
+ flat2 = vmaxq_u16(flat2, vabdq_u16(p3, p0));
+ flat2 = vmaxq_u16(flat2, vabdq_u16(p2, p0));
+ flat2 = vmaxq_u16(flat2, vabdq_u16(p1, p0));
+ flat2 = vmaxq_u16(flat2, vabdq_u16(q1, q0));
+ flat2 = vmaxq_u16(flat2, vabdq_u16(q2, q0));
+ flat2 = vmaxq_u16(flat2, vabdq_u16(q3, q0));
+ flat2 = vmaxq_u16(flat2, vabdq_u16(q4, q0));
+ flat2 = vcleq_u16(flat2, vdupq_n_u16(1 << (bd - 8)));
+ flat2 = vandq_u16(flat2, flat);
+ *flat2_status = calc_flat_status(flat2);
+
+ return flat2;
+}
+
+static INLINE int16x8_t flip_sign(const uint16x8_t v, const int bd) {
+ const uint16x8_t offset = vdupq_n_u16(0x80 << (bd - 8));
+ return vreinterpretq_s16_u16(vsubq_u16(v, offset));
+}
+
+static INLINE uint16x8_t flip_sign_back(const int16x8_t v, const int bd) {
+ const int16x8_t offset = vdupq_n_s16(0x80 << (bd - 8));
+ return vreinterpretq_u16_s16(vaddq_s16(v, offset));
+}
+
+static INLINE void filter_update(const uint16x8_t sub0, const uint16x8_t sub1,
+ const uint16x8_t add0, const uint16x8_t add1,
+ uint16x8_t *sum) {
+ *sum = vsubq_u16(*sum, sub0);
+ *sum = vsubq_u16(*sum, sub1);
+ *sum = vaddq_u16(*sum, add0);
+ *sum = vaddq_u16(*sum, add1);
+}
+
+static INLINE uint16x8_t calc_7_tap_filter_kernel(const uint16x8_t sub0,
+ const uint16x8_t sub1,
+ const uint16x8_t add0,
+ const uint16x8_t add1,
+ uint16x8_t *sum) {
+ filter_update(sub0, sub1, add0, add1, sum);
+ return vrshrq_n_u16(*sum, 3);
+}
+
+static INLINE uint16x8_t apply_15_tap_filter_kernel(
+ const uint16x8_t flat, const uint16x8_t sub0, const uint16x8_t sub1,
+ const uint16x8_t add0, const uint16x8_t add1, const uint16x8_t in,
+ uint16x8_t *sum) {
+ filter_update(sub0, sub1, add0, add1, sum);
+ return vbslq_u16(flat, vrshrq_n_u16(*sum, 4), in);
+}
+
+// 7-tap filter [1, 1, 1, 2, 1, 1, 1]
+static INLINE void calc_7_tap_filter(const uint16x8_t p3, const uint16x8_t p2,
+ const uint16x8_t p1, const uint16x8_t p0,
+ const uint16x8_t q0, const uint16x8_t q1,
+ const uint16x8_t q2, const uint16x8_t q3,
+ uint16x8_t *op2, uint16x8_t *op1,
+ uint16x8_t *op0, uint16x8_t *oq0,
+ uint16x8_t *oq1, uint16x8_t *oq2) {
+ uint16x8_t sum;
+ sum = vaddq_u16(p3, p3); // 2*p3
+ sum = vaddq_u16(sum, p3); // 3*p3
+ sum = vaddq_u16(sum, p2); // 3*p3+p2
+ sum = vaddq_u16(sum, p2); // 3*p3+2*p2
+ sum = vaddq_u16(sum, p1); // 3*p3+2*p2+p1
+ sum = vaddq_u16(sum, p0); // 3*p3+2*p2+p1+p0
+ sum = vaddq_u16(sum, q0); // 3*p3+2*p2+p1+p0+q0
+ *op2 = vrshrq_n_u16(sum, 3);
+ *op1 = calc_7_tap_filter_kernel(p3, p2, p1, q1, &sum);
+ *op0 = calc_7_tap_filter_kernel(p3, p1, p0, q2, &sum);
+ *oq0 = calc_7_tap_filter_kernel(p3, p0, q0, q3, &sum);
+ *oq1 = calc_7_tap_filter_kernel(p2, q0, q1, q3, &sum);
+ *oq2 = calc_7_tap_filter_kernel(p1, q1, q2, q3, &sum);
+}
+
+static INLINE void apply_7_tap_filter(const uint16x8_t flat,
+ const uint16x8_t p3, const uint16x8_t p2,
+ const uint16x8_t p1, const uint16x8_t p0,
+ const uint16x8_t q0, const uint16x8_t q1,
+ const uint16x8_t q2, const uint16x8_t q3,
+ uint16x8_t *op2, uint16x8_t *op1,
+ uint16x8_t *op0, uint16x8_t *oq0,
+ uint16x8_t *oq1, uint16x8_t *oq2) {
+ uint16x8_t tp1, tp0, tq0, tq1;
+ calc_7_tap_filter(p3, p2, p1, p0, q0, q1, q2, q3, op2, &tp1, &tp0, &tq0, &tq1,
+ oq2);
+ *op2 = vbslq_u16(flat, *op2, p2);
+ *op1 = vbslq_u16(flat, tp1, *op1);
+ *op0 = vbslq_u16(flat, tp0, *op0);
+ *oq0 = vbslq_u16(flat, tq0, *oq0);
+ *oq1 = vbslq_u16(flat, tq1, *oq1);
+ *oq2 = vbslq_u16(flat, *oq2, q2);
+}
+
+// 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1]
+static INLINE void apply_15_tap_filter(
+ const uint16x8_t flat2, const uint16x8_t p7, const uint16x8_t p6,
+ const uint16x8_t p5, const uint16x8_t p4, const uint16x8_t p3,
+ const uint16x8_t p2, const uint16x8_t p1, const uint16x8_t p0,
+ const uint16x8_t q0, const uint16x8_t q1, const uint16x8_t q2,
+ const uint16x8_t q3, const uint16x8_t q4, const uint16x8_t q5,
+ const uint16x8_t q6, const uint16x8_t q7, uint16x8_t *op6, uint16x8_t *op5,
+ uint16x8_t *op4, uint16x8_t *op3, uint16x8_t *op2, uint16x8_t *op1,
+ uint16x8_t *op0, uint16x8_t *oq0, uint16x8_t *oq1, uint16x8_t *oq2,
+ uint16x8_t *oq3, uint16x8_t *oq4, uint16x8_t *oq5, uint16x8_t *oq6) {
+ uint16x8_t sum;
+ sum = vshlq_n_u16(p7, 3); // 8*p7
+ sum = vsubq_u16(sum, p7); // 7*p7
+ sum = vaddq_u16(sum, p6); // 7*p7+p6
+ sum = vaddq_u16(sum, p6); // 7*p7+2*p6
+ sum = vaddq_u16(sum, p5); // 7*p7+2*p6+p5
+ sum = vaddq_u16(sum, p4); // 7*p7+2*p6+p5+p4
+ sum = vaddq_u16(sum, p3); // 7*p7+2*p6+p5+p4+p3
+ sum = vaddq_u16(sum, p2); // 7*p7+2*p6+p5+p4+p3+p2
+ sum = vaddq_u16(sum, p1); // 7*p7+2*p6+p5+p4+p3+p2+p1
+ sum = vaddq_u16(sum, p0); // 7*p7+2*p6+p5+p4+p3+p2+p1+p0
+ sum = vaddq_u16(sum, q0); // 7*p7+2*p6+p5+p4+p3+p2+p1+p0+q0
+ *op6 = vbslq_u16(flat2, vrshrq_n_u16(sum, 4), p6);
+ *op5 = apply_15_tap_filter_kernel(flat2, p7, p6, p5, q1, p5, &sum);
+ *op4 = apply_15_tap_filter_kernel(flat2, p7, p5, p4, q2, p4, &sum);
+ *op3 = apply_15_tap_filter_kernel(flat2, p7, p4, p3, q3, p3, &sum);
+ *op2 = apply_15_tap_filter_kernel(flat2, p7, p3, p2, q4, *op2, &sum);
+ *op1 = apply_15_tap_filter_kernel(flat2, p7, p2, p1, q5, *op1, &sum);
+ *op0 = apply_15_tap_filter_kernel(flat2, p7, p1, p0, q6, *op0, &sum);
+ *oq0 = apply_15_tap_filter_kernel(flat2, p7, p0, q0, q7, *oq0, &sum);
+ *oq1 = apply_15_tap_filter_kernel(flat2, p6, q0, q1, q7, *oq1, &sum);
+ *oq2 = apply_15_tap_filter_kernel(flat2, p5, q1, q2, q7, *oq2, &sum);
+ *oq3 = apply_15_tap_filter_kernel(flat2, p4, q2, q3, q7, q3, &sum);
+ *oq4 = apply_15_tap_filter_kernel(flat2, p3, q3, q4, q7, q4, &sum);
+ *oq5 = apply_15_tap_filter_kernel(flat2, p2, q4, q5, q7, q5, &sum);
+ *oq6 = apply_15_tap_filter_kernel(flat2, p1, q5, q6, q7, q6, &sum);
+}
+
+static INLINE void filter4(const uint16x8_t mask, const uint16x8_t hev,
+ const uint16x8_t p1, const uint16x8_t p0,
+ const uint16x8_t q0, const uint16x8_t q1,
+ uint16x8_t *op1, uint16x8_t *op0, uint16x8_t *oq0,
+ uint16x8_t *oq1, const int bd) {
+ const int16x8_t max = vdupq_n_s16((1 << (bd - 1)) - 1);
+ const int16x8_t min = vdupq_n_s16((int16_t)(((uint32_t)-1) << (bd - 1)));
+ int16x8_t filter, filter1, filter2, t;
+ int16x8_t ps1 = flip_sign(p1, bd);
+ int16x8_t ps0 = flip_sign(p0, bd);
+ int16x8_t qs0 = flip_sign(q0, bd);
+ int16x8_t qs1 = flip_sign(q1, bd);
+
+ /* add outer taps if we have high edge variance */
+ filter = vsubq_s16(ps1, qs1);
+ filter = vmaxq_s16(filter, min);
+ filter = vminq_s16(filter, max);
+ filter = vandq_s16(filter, vreinterpretq_s16_u16(hev));
+ t = vsubq_s16(qs0, ps0);
+
+ /* inner taps */
+ filter = vaddq_s16(filter, t);
+ filter = vaddq_s16(filter, t);
+ filter = vaddq_s16(filter, t);
+ filter = vmaxq_s16(filter, min);
+ filter = vminq_s16(filter, max);
+ filter = vandq_s16(filter, vreinterpretq_s16_u16(mask));
+
+ /* save bottom 3 bits so that we round one side +4 and the other +3 */
+ /* if it equals 4 we'll set it to adjust by -1 to account for the fact */
+ /* we'd round it by 3 the other way */
+ t = vaddq_s16(filter, vdupq_n_s16(4));
+ t = vminq_s16(t, max);
+ filter1 = vshrq_n_s16(t, 3);
+ t = vaddq_s16(filter, vdupq_n_s16(3));
+ t = vminq_s16(t, max);
+ filter2 = vshrq_n_s16(t, 3);
+
+ qs0 = vsubq_s16(qs0, filter1);
+ qs0 = vmaxq_s16(qs0, min);
+ qs0 = vminq_s16(qs0, max);
+ ps0 = vaddq_s16(ps0, filter2);
+ ps0 = vmaxq_s16(ps0, min);
+ ps0 = vminq_s16(ps0, max);
+ *oq0 = flip_sign_back(qs0, bd);
+ *op0 = flip_sign_back(ps0, bd);
+
+ /* outer tap adjustments */
+ filter = vrshrq_n_s16(filter1, 1);
+ filter = vbicq_s16(filter, vreinterpretq_s16_u16(hev));
+
+ qs1 = vsubq_s16(qs1, filter);
+ qs1 = vmaxq_s16(qs1, min);
+ qs1 = vminq_s16(qs1, max);
+ ps1 = vaddq_s16(ps1, filter);
+ ps1 = vmaxq_s16(ps1, min);
+ ps1 = vminq_s16(ps1, max);
+ *oq1 = flip_sign_back(qs1, bd);
+ *op1 = flip_sign_back(ps1, bd);
+}
+
+static INLINE void filter8(const uint16x8_t mask, const uint16x8_t flat,
+ const uint32_t flat_status, const uint16x8_t hev,
+ const uint16x8_t p3, const uint16x8_t p2,
+ const uint16x8_t p1, const uint16x8_t p0,
+ const uint16x8_t q0, const uint16x8_t q1,
+ const uint16x8_t q2, const uint16x8_t q3,
+ uint16x8_t *op2, uint16x8_t *op1, uint16x8_t *op0,
+ uint16x8_t *oq0, uint16x8_t *oq1, uint16x8_t *oq2,
+ const int bd) {
+ if (flat_status != (uint32_t)-4) {
+ filter4(mask, hev, p1, p0, q0, q1, op1, op0, oq0, oq1, bd);
+ *op2 = p2;
+ *oq2 = q2;
+ if (flat_status) {
+ apply_7_tap_filter(flat, p3, p2, p1, p0, q0, q1, q2, q3, op2, op1, op0,
+ oq0, oq1, oq2);
+ }
+ } else {
+ calc_7_tap_filter(p3, p2, p1, p0, q0, q1, q2, q3, op2, op1, op0, oq0, oq1,
+ oq2);
+ }
+}
+
+static INLINE void filter16(
+ const uint16x8_t mask, const uint16x8_t flat, const uint32_t flat_status,
+ const uint16x8_t flat2, const uint32_t flat2_status, const uint16x8_t hev,
+ const uint16x8_t p7, const uint16x8_t p6, const uint16x8_t p5,
+ const uint16x8_t p4, const uint16x8_t p3, const uint16x8_t p2,
+ const uint16x8_t p1, const uint16x8_t p0, const uint16x8_t q0,
+ const uint16x8_t q1, const uint16x8_t q2, const uint16x8_t q3,
+ const uint16x8_t q4, const uint16x8_t q5, const uint16x8_t q6,
+ const uint16x8_t q7, uint16x8_t *op6, uint16x8_t *op5, uint16x8_t *op4,
+ uint16x8_t *op3, uint16x8_t *op2, uint16x8_t *op1, uint16x8_t *op0,
+ uint16x8_t *oq0, uint16x8_t *oq1, uint16x8_t *oq2, uint16x8_t *oq3,
+ uint16x8_t *oq4, uint16x8_t *oq5, uint16x8_t *oq6, const int bd) {
+ if (flat_status != (uint32_t)-4) {
+ filter4(mask, hev, p1, p0, q0, q1, op1, op0, oq0, oq1, bd);
+ }
+
+ if (flat_status) {
+ *op2 = p2;
+ *oq2 = q2;
+ if (flat2_status != (uint32_t)-4) {
+ apply_7_tap_filter(flat, p3, p2, p1, p0, q0, q1, q2, q3, op2, op1, op0,
+ oq0, oq1, oq2);
+ }
+ if (flat2_status) {
+ apply_15_tap_filter(flat2, p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3,
+ q4, q5, q6, q7, op6, op5, op4, op3, op2, op1, op0,
+ oq0, oq1, oq2, oq3, oq4, oq5, oq6);
+ }
+ }
+}
+
+static INLINE void load_8x8(const uint16_t *s, const int p, uint16x8_t *p3,
+ uint16x8_t *p2, uint16x8_t *p1, uint16x8_t *p0,
+ uint16x8_t *q0, uint16x8_t *q1, uint16x8_t *q2,
+ uint16x8_t *q3) {
+ *p3 = vld1q_u16(s);
+ s += p;
+ *p2 = vld1q_u16(s);
+ s += p;
+ *p1 = vld1q_u16(s);
+ s += p;
+ *p0 = vld1q_u16(s);
+ s += p;
+ *q0 = vld1q_u16(s);
+ s += p;
+ *q1 = vld1q_u16(s);
+ s += p;
+ *q2 = vld1q_u16(s);
+ s += p;
+ *q3 = vld1q_u16(s);
+}
+
+static INLINE void load_8x16(const uint16_t *s, const int p, uint16x8_t *s0,
+ uint16x8_t *s1, uint16x8_t *s2, uint16x8_t *s3,
+ uint16x8_t *s4, uint16x8_t *s5, uint16x8_t *s6,
+ uint16x8_t *s7, uint16x8_t *s8, uint16x8_t *s9,
+ uint16x8_t *s10, uint16x8_t *s11, uint16x8_t *s12,
+ uint16x8_t *s13, uint16x8_t *s14,
+ uint16x8_t *s15) {
+ *s0 = vld1q_u16(s);
+ s += p;
+ *s1 = vld1q_u16(s);
+ s += p;
+ *s2 = vld1q_u16(s);
+ s += p;
+ *s3 = vld1q_u16(s);
+ s += p;
+ *s4 = vld1q_u16(s);
+ s += p;
+ *s5 = vld1q_u16(s);
+ s += p;
+ *s6 = vld1q_u16(s);
+ s += p;
+ *s7 = vld1q_u16(s);
+ s += p;
+ *s8 = vld1q_u16(s);
+ s += p;
+ *s9 = vld1q_u16(s);
+ s += p;
+ *s10 = vld1q_u16(s);
+ s += p;
+ *s11 = vld1q_u16(s);
+ s += p;
+ *s12 = vld1q_u16(s);
+ s += p;
+ *s13 = vld1q_u16(s);
+ s += p;
+ *s14 = vld1q_u16(s);
+ s += p;
+ *s15 = vld1q_u16(s);
+}
+
+static INLINE void store_8x4(uint16_t *s, const int p, const uint16x8_t s0,
+ const uint16x8_t s1, const uint16x8_t s2,
+ const uint16x8_t s3) {
+ vst1q_u16(s, s0);
+ s += p;
+ vst1q_u16(s, s1);
+ s += p;
+ vst1q_u16(s, s2);
+ s += p;
+ vst1q_u16(s, s3);
+}
+
+static INLINE void store_8x6(uint16_t *s, const int p, const uint16x8_t s0,
+ const uint16x8_t s1, const uint16x8_t s2,
+ const uint16x8_t s3, const uint16x8_t s4,
+ const uint16x8_t s5) {
+ vst1q_u16(s, s0);
+ s += p;
+ vst1q_u16(s, s1);
+ s += p;
+ vst1q_u16(s, s2);
+ s += p;
+ vst1q_u16(s, s3);
+ s += p;
+ vst1q_u16(s, s4);
+ s += p;
+ vst1q_u16(s, s5);
+}
+
+static INLINE void store_4x8(uint16_t *s, const int p, const uint16x8_t p1,
+ const uint16x8_t p0, const uint16x8_t q0,
+ const uint16x8_t q1) {
+ uint16x8x4_t o;
+
+ o.val[0] = p1;
+ o.val[1] = p0;
+ o.val[2] = q0;
+ o.val[3] = q1;
+ vst4q_lane_u16(s, o, 0);
+ s += p;
+ vst4q_lane_u16(s, o, 1);
+ s += p;
+ vst4q_lane_u16(s, o, 2);
+ s += p;
+ vst4q_lane_u16(s, o, 3);
+ s += p;
+ vst4q_lane_u16(s, o, 4);
+ s += p;
+ vst4q_lane_u16(s, o, 5);
+ s += p;
+ vst4q_lane_u16(s, o, 6);
+ s += p;
+ vst4q_lane_u16(s, o, 7);
+}
+
+static INLINE void store_6x8(uint16_t *s, const int p, const uint16x8_t s0,
+ const uint16x8_t s1, const uint16x8_t s2,
+ const uint16x8_t s3, const uint16x8_t s4,
+ const uint16x8_t s5) {
+ uint16x8x3_t o0, o1;
+
+ o0.val[0] = s0;
+ o0.val[1] = s1;
+ o0.val[2] = s2;
+ o1.val[0] = s3;
+ o1.val[1] = s4;
+ o1.val[2] = s5;
+ vst3q_lane_u16(s - 3, o0, 0);
+ vst3q_lane_u16(s + 0, o1, 0);
+ s += p;
+ vst3q_lane_u16(s - 3, o0, 1);
+ vst3q_lane_u16(s + 0, o1, 1);
+ s += p;
+ vst3q_lane_u16(s - 3, o0, 2);
+ vst3q_lane_u16(s + 0, o1, 2);
+ s += p;
+ vst3q_lane_u16(s - 3, o0, 3);
+ vst3q_lane_u16(s + 0, o1, 3);
+ s += p;
+ vst3q_lane_u16(s - 3, o0, 4);
+ vst3q_lane_u16(s + 0, o1, 4);
+ s += p;
+ vst3q_lane_u16(s - 3, o0, 5);
+ vst3q_lane_u16(s + 0, o1, 5);
+ s += p;
+ vst3q_lane_u16(s - 3, o0, 6);
+ vst3q_lane_u16(s + 0, o1, 6);
+ s += p;
+ vst3q_lane_u16(s - 3, o0, 7);
+ vst3q_lane_u16(s + 0, o1, 7);
+}
+
+static INLINE void store_7x8(uint16_t *s, const int p, const uint16x8_t s0,
+ const uint16x8_t s1, const uint16x8_t s2,
+ const uint16x8_t s3, const uint16x8_t s4,
+ const uint16x8_t s5, const uint16x8_t s6) {
+ uint16x8x4_t o0;
+ uint16x8x3_t o1;
+
+ o0.val[0] = s0;
+ o0.val[1] = s1;
+ o0.val[2] = s2;
+ o0.val[3] = s3;
+ o1.val[0] = s4;
+ o1.val[1] = s5;
+ o1.val[2] = s6;
+ vst4q_lane_u16(s - 4, o0, 0);
+ vst3q_lane_u16(s + 0, o1, 0);
+ s += p;
+ vst4q_lane_u16(s - 4, o0, 1);
+ vst3q_lane_u16(s + 0, o1, 1);
+ s += p;
+ vst4q_lane_u16(s - 4, o0, 2);
+ vst3q_lane_u16(s + 0, o1, 2);
+ s += p;
+ vst4q_lane_u16(s - 4, o0, 3);
+ vst3q_lane_u16(s + 0, o1, 3);
+ s += p;
+ vst4q_lane_u16(s - 4, o0, 4);
+ vst3q_lane_u16(s + 0, o1, 4);
+ s += p;
+ vst4q_lane_u16(s - 4, o0, 5);
+ vst3q_lane_u16(s + 0, o1, 5);
+ s += p;
+ vst4q_lane_u16(s - 4, o0, 6);
+ vst3q_lane_u16(s + 0, o1, 6);
+ s += p;
+ vst4q_lane_u16(s - 4, o0, 7);
+ vst3q_lane_u16(s + 0, o1, 7);
+}
+
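+// Write back the horizontal 16-wide filter output. p1..q1 are stored
+// unconditionally, p2/q2 only when the flat mask fired, and the outer
+// p6..p3 / q3..q6 rows only when flat2 fired as well.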
+static INLINE void store_8x14(uint16_t *s, const int p, const uint16x8_t p6,
+ const uint16x8_t p5, const uint16x8_t p4,
+ const uint16x8_t p3, const uint16x8_t p2,
+ const uint16x8_t p1, const uint16x8_t p0,
+ const uint16x8_t q0, const uint16x8_t q1,
+ const uint16x8_t q2, const uint16x8_t q3,
+ const uint16x8_t q4, const uint16x8_t q5,
+ const uint16x8_t q6, const uint32_t flat_status,
+ const uint32_t flat2_status) {
+ if (flat_status) {
+ if (flat2_status) {
+ vst1q_u16(s - 7 * p, p6);
+ vst1q_u16(s - 6 * p, p5);
+ vst1q_u16(s - 5 * p, p4);
+ vst1q_u16(s - 4 * p, p3);
+ vst1q_u16(s + 3 * p, q3);
+ vst1q_u16(s + 4 * p, q4);
+ vst1q_u16(s + 5 * p, q5);
+ vst1q_u16(s + 6 * p, q6);
+ }
+ vst1q_u16(s - 3 * p, p2);
+ vst1q_u16(s + 2 * p, q2);
+ }
+ vst1q_u16(s - 2 * p, p1);
+ vst1q_u16(s - 1 * p, p0);
+ vst1q_u16(s + 0 * p, q0);
+ vst1q_u16(s + 1 * p, q1);
+}
+
+void vpx_highbd_lpf_horizontal_4_neon(uint16_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ uint16x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
+ mask, hev;
+
+ load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd);
+ load_8x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+ filter_hev_mask4(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0, q0, q1,
+ q2, q3, &hev, &mask);
+ filter4(mask, hev, p1, p0, q0, q1, &p1, &p0, &q0, &q1, bd);
+ store_8x4(s - 2 * p, p, p1, p0, q0, q1);
+}
+
+void vpx_highbd_lpf_horizontal_4_dual_neon(
+ uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ vpx_highbd_lpf_horizontal_4_neon(s, p, blimit0, limit0, thresh0, bd);
+ vpx_highbd_lpf_horizontal_4_neon(s + 8, p, blimit1, limit1, thresh1, bd);
+}
+
+void vpx_highbd_lpf_vertical_4_neon(uint16_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh,
+ int bd) {
+ uint16x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
+ mask, hev;
+
+ load_8x8(s - 4, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+ transpose_s16_8x8((int16x8_t *)&p3, (int16x8_t *)&p2, (int16x8_t *)&p1,
+ (int16x8_t *)&p0, (int16x8_t *)&q0, (int16x8_t *)&q1,
+ (int16x8_t *)&q2, (int16x8_t *)&q3);
+ load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd);
+ filter_hev_mask4(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0, q0, q1,
+ q2, q3, &hev, &mask);
+ filter4(mask, hev, p1, p0, q0, q1, &p1, &p0, &q0, &q1, bd);
+ store_4x8(s - 2, p, p1, p0, q0, q1);
+}
+
+void vpx_highbd_lpf_vertical_4_dual_neon(
+ uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ vpx_highbd_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0, bd);
+ vpx_highbd_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1, bd);
+}
+
+void vpx_highbd_lpf_horizontal_8_neon(uint16_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ uint16x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
+ op2, op1, op0, oq0, oq1, oq2, mask, flat, hev;
+ uint32_t flat_status;
+
+ load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd);
+ load_8x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+ mask = filter_flat_hev_mask(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0,
+ q0, q1, q2, q3, &flat, &flat_status, &hev, bd);
+ filter8(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2,
+ &op1, &op0, &oq0, &oq1, &oq2, bd);
+ store_8x6(s - 3 * p, p, op2, op1, op0, oq0, oq1, oq2);
+}
+
+void vpx_highbd_lpf_horizontal_8_dual_neon(
+ uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ vpx_highbd_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0, bd);
+ vpx_highbd_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1, bd);
+}
+
+void vpx_highbd_lpf_vertical_8_neon(uint16_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh,
+ int bd) {
+ uint16x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
+ op2, op1, op0, oq0, oq1, oq2, mask, flat, hev;
+ uint32_t flat_status;
+
+ load_8x8(s - 4, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+ transpose_s16_8x8((int16x8_t *)&p3, (int16x8_t *)&p2, (int16x8_t *)&p1,
+ (int16x8_t *)&p0, (int16x8_t *)&q0, (int16x8_t *)&q1,
+ (int16x8_t *)&q2, (int16x8_t *)&q3);
+ load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd);
+ mask = filter_flat_hev_mask(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0,
+ q0, q1, q2, q3, &flat, &flat_status, &hev, bd);
+ filter8(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2,
+ &op1, &op0, &oq0, &oq1, &oq2, bd);
+ // Note: store_6x8() is faster than transpose + store_8x8().
+ store_6x8(s, p, op2, op1, op0, oq0, oq1, oq2);
+}
+
+void vpx_highbd_lpf_vertical_8_dual_neon(
+ uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ vpx_highbd_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0, bd);
+ vpx_highbd_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1, bd);
+}
+
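+// Filter a group of 8 pixels across a horizontal edge. The dual (16 pixel)
+// entry point below simply runs this kernel twice, 8 pixels apart.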
+static void lpf_horizontal_16_kernel(uint16_t *s, int p,
+ const uint16x8_t blimit_vec,
+ const uint16x8_t limit_vec,
+ const uint16x8_t thresh_vec,
+ const int bd) {
+ uint16x8_t mask, flat, flat2, hev, p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2,
+ q3, q4, q5, q6, q7, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3,
+ oq4, oq5, oq6;
+ uint32_t flat_status, flat2_status;
+
+ load_8x16(s - 8 * p, p, &p7, &p6, &p5, &p4, &p3, &p2, &p1, &p0, &q0, &q1, &q2,
+ &q3, &q4, &q5, &q6, &q7);
+ mask = filter_flat_hev_mask(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0,
+ q0, q1, q2, q3, &flat, &flat_status, &hev, bd);
+ flat2 = flat_mask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat,
+ &flat2_status, bd);
+ filter16(mask, flat, flat_status, flat2, flat2_status, hev, p7, p6, p5, p4,
+ p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, &op6, &op5, &op4,
+ &op3, &op2, &op1, &op0, &oq0, &oq1, &oq2, &oq3, &oq4, &oq5, &oq6,
+ bd);
+ store_8x14(s, p, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4,
+ oq5, oq6, flat_status, flat2_status);
+}
+
+static void lpf_vertical_16_kernel(uint16_t *s, int p,
+ const uint16x8_t blimit_vec,
+ const uint16x8_t limit_vec,
+ const uint16x8_t thresh_vec, const int bd) {
+ uint16x8_t mask, flat, flat2, hev, p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2,
+ q3, q4, q5, q6, q7, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3,
+ oq4, oq5, oq6;
+ uint32_t flat_status, flat2_status;
+
+ load_8x8(s - 8, p, &p7, &p6, &p5, &p4, &p3, &p2, &p1, &p0);
+ transpose_s16_8x8((int16x8_t *)&p7, (int16x8_t *)&p6, (int16x8_t *)&p5,
+ (int16x8_t *)&p4, (int16x8_t *)&p3, (int16x8_t *)&p2,
+ (int16x8_t *)&p1, (int16x8_t *)&p0);
+ load_8x8(s, p, &q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7);
+ transpose_s16_8x8((int16x8_t *)&q0, (int16x8_t *)&q1, (int16x8_t *)&q2,
+ (int16x8_t *)&q3, (int16x8_t *)&q4, (int16x8_t *)&q5,
+ (int16x8_t *)&q6, (int16x8_t *)&q7);
+ mask = filter_flat_hev_mask(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0,
+ q0, q1, q2, q3, &flat, &flat_status, &hev, bd);
+ flat2 = flat_mask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat,
+ &flat2_status, bd);
+ filter16(mask, flat, flat_status, flat2, flat2_status, hev, p7, p6, p5, p4,
+ p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, &op6, &op5, &op4,
+ &op3, &op2, &op1, &op0, &oq0, &oq1, &oq2, &oq3, &oq4, &oq5, &oq6,
+ bd);
+ if (flat_status) {
+ if (flat2_status) {
+ store_7x8(s - 3, p, op6, op5, op4, op3, op2, op1, op0);
+ store_7x8(s + 4, p, oq0, oq1, oq2, oq3, oq4, oq5, oq6);
+ } else {
+ // Note: store_6x8() is faster than transpose + store_8x8().
+ store_6x8(s, p, op2, op1, op0, oq0, oq1, oq2);
+ }
+ } else {
+ store_4x8(s - 2, p, op1, op0, oq0, oq1);
+ }
+}
+
+void vpx_highbd_lpf_horizontal_16_neon(uint16_t *s, int p,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ uint16x8_t blimit_vec, limit_vec, thresh_vec;
+ load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd);
+ lpf_horizontal_16_kernel(s, p, blimit_vec, limit_vec, thresh_vec, bd);
+}
+
+void vpx_highbd_lpf_horizontal_16_dual_neon(uint16_t *s, int p,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ uint16x8_t blimit_vec, limit_vec, thresh_vec;
+ load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd);
+ lpf_horizontal_16_kernel(s, p, blimit_vec, limit_vec, thresh_vec, bd);
+ lpf_horizontal_16_kernel(s + 8, p, blimit_vec, limit_vec, thresh_vec, bd);
+}
+
+void vpx_highbd_lpf_vertical_16_neon(uint16_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ uint16x8_t blimit_vec, limit_vec, thresh_vec;
+ load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd);
+ lpf_vertical_16_kernel(s, p, blimit_vec, limit_vec, thresh_vec, bd);
+}
+
+void vpx_highbd_lpf_vertical_16_dual_neon(uint16_t *s, int p,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ uint16x8_t blimit_vec, limit_vec, thresh_vec;
+ load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd);
+ lpf_vertical_16_kernel(s, p, blimit_vec, limit_vec, thresh_vec, bd);
+ lpf_vertical_16_kernel(s + 8 * p, p, blimit_vec, limit_vec, thresh_vec, bd);
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c
new file mode 100644
index 00000000000..1fde13e8d6d
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c
@@ -0,0 +1,923 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_ports/mem.h"
+
+static INLINE void load_4x4(const int16_t *s, ptrdiff_t p, int16x4_t *s0,
+ int16x4_t *s1, int16x4_t *s2, int16x4_t *s3) {
+ *s0 = vld1_s16(s);
+ s += p;
+ *s1 = vld1_s16(s);
+ s += p;
+ *s2 = vld1_s16(s);
+ s += p;
+ *s3 = vld1_s16(s);
+}
+
+static INLINE void load_8x4(const uint16_t *s, ptrdiff_t p, uint16x8_t *s0,
+ uint16x8_t *s1, uint16x8_t *s2, uint16x8_t *s3) {
+ *s0 = vld1q_u16(s);
+ s += p;
+ *s1 = vld1q_u16(s);
+ s += p;
+ *s2 = vld1q_u16(s);
+ s += p;
+ *s3 = vld1q_u16(s);
+}
+
+static INLINE void load_8x8(const int16_t *s, ptrdiff_t p, int16x8_t *s0,
+ int16x8_t *s1, int16x8_t *s2, int16x8_t *s3,
+ int16x8_t *s4, int16x8_t *s5, int16x8_t *s6,
+ int16x8_t *s7) {
+ *s0 = vld1q_s16(s);
+ s += p;
+ *s1 = vld1q_s16(s);
+ s += p;
+ *s2 = vld1q_s16(s);
+ s += p;
+ *s3 = vld1q_s16(s);
+ s += p;
+ *s4 = vld1q_s16(s);
+ s += p;
+ *s5 = vld1q_s16(s);
+ s += p;
+ *s6 = vld1q_s16(s);
+ s += p;
+ *s7 = vld1q_s16(s);
+}
+
+static INLINE void store_8x8(uint16_t *s, ptrdiff_t p, const uint16x8_t s0,
+ const uint16x8_t s1, const uint16x8_t s2,
+ const uint16x8_t s3, const uint16x8_t s4,
+ const uint16x8_t s5, const uint16x8_t s6,
+ const uint16x8_t s7) {
+ vst1q_u16(s, s0);
+ s += p;
+ vst1q_u16(s, s1);
+ s += p;
+ vst1q_u16(s, s2);
+ s += p;
+ vst1q_u16(s, s3);
+ s += p;
+ vst1q_u16(s, s4);
+ s += p;
+ vst1q_u16(s, s5);
+ s += p;
+ vst1q_u16(s, s6);
+ s += p;
+ vst1q_u16(s, s7);
+}
+
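+// 8-tap filter of 4 pixels. Products are accumulated in 32 bits; rounding,
+// narrowing and clamping to the bit-depth maximum are left to the caller.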
+static INLINE int32x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1,
+ const int16x4_t s2, const int16x4_t s3,
+ const int16x4_t s4, const int16x4_t s5,
+ const int16x4_t s6, const int16x4_t s7,
+ const int16x8_t filters) {
+ const int16x4_t filters_lo = vget_low_s16(filters);
+ const int16x4_t filters_hi = vget_high_s16(filters);
+ int32x4_t sum = vdupq_n_s32(0);
+
+ sum = vmlal_lane_s16(sum, s0, filters_lo, 0);
+ sum = vmlal_lane_s16(sum, s1, filters_lo, 1);
+ sum = vmlal_lane_s16(sum, s2, filters_lo, 2);
+ sum = vmlal_lane_s16(sum, s3, filters_lo, 3);
+ sum = vmlal_lane_s16(sum, s4, filters_hi, 0);
+ sum = vmlal_lane_s16(sum, s5, filters_hi, 1);
+ sum = vmlal_lane_s16(sum, s6, filters_hi, 2);
+ sum = vmlal_lane_s16(sum, s7, filters_hi, 3);
+ return sum;
+}
+
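+// 8-tap filter of 8 pixels: accumulate in 32 bits, round and narrow with a
+// saturating shift by 7 (the filter precision), then clamp to 'max',
+// i.e. (1 << bd) - 1.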
+static INLINE uint16x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1,
+ const int16x8_t s2, const int16x8_t s3,
+ const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t s6, const int16x8_t s7,
+ const int16x8_t filters,
+ const uint16x8_t max) {
+ const int16x4_t filters_lo = vget_low_s16(filters);
+ const int16x4_t filters_hi = vget_high_s16(filters);
+ int32x4_t sum0 = vdupq_n_s32(0);
+ int32x4_t sum1 = vdupq_n_s32(0);
+ uint16x8_t d;
+
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s0), filters_lo, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filters_lo, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filters_lo, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filters_lo, 3);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filters_hi, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filters_hi, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), filters_hi, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), filters_hi, 3);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s0), filters_lo, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filters_lo, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filters_lo, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filters_lo, 3);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filters_hi, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filters_hi, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), filters_hi, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), filters_hi, 3);
+ d = vcombine_u16(vqrshrun_n_s32(sum0, 7), vqrshrun_n_s32(sum1, 7));
+ d = vminq_u16(d, max);
+ return d;
+}
+
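+// Horizontal 8-tap convolution. Only the unscaled case (x_step_q4 == 16) is
+// vectorized; any other step size falls back to the C implementation.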
+void vpx_highbd_convolve8_horiz_neon(const uint8_t *src8, ptrdiff_t src_stride,
+ uint8_t *dst8, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, // unused
+ int y_step_q4, // unused
+ int w, int h, int bd) {
+ if (x_step_q4 != 16) {
+ vpx_highbd_convolve8_horiz_c(src8, src_stride, dst8, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h, bd);
+ } else {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ const int16x8_t filters = vld1q_s16(filter_x);
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+ uint16x8_t t0, t1, t2, t3;
+
+ assert(!((intptr_t)dst & 3));
+ assert(!(dst_stride & 3));
+
+ src -= 3;
+
+ if (h == 4) {
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+ int32x4_t d0, d1, d2, d3;
+ uint16x8_t d01, d23;
+
+ __builtin_prefetch(src + 0 * src_stride);
+ __builtin_prefetch(src + 1 * src_stride);
+ __builtin_prefetch(src + 2 * src_stride);
+ __builtin_prefetch(src + 3 * src_stride);
+ load_8x4(src, src_stride, &t0, &t1, &t2, &t3);
+ transpose_u16_8x4(&t0, &t1, &t2, &t3);
+ s0 = vreinterpret_s16_u16(vget_low_u16(t0));
+ s1 = vreinterpret_s16_u16(vget_low_u16(t1));
+ s2 = vreinterpret_s16_u16(vget_low_u16(t2));
+ s3 = vreinterpret_s16_u16(vget_low_u16(t3));
+ s4 = vreinterpret_s16_u16(vget_high_u16(t0));
+ s5 = vreinterpret_s16_u16(vget_high_u16(t1));
+ s6 = vreinterpret_s16_u16(vget_high_u16(t2));
+ __builtin_prefetch(dst + 0 * dst_stride);
+ __builtin_prefetch(dst + 1 * dst_stride);
+ __builtin_prefetch(dst + 2 * dst_stride);
+ __builtin_prefetch(dst + 3 * dst_stride);
+ src += 7;
+
+ do {
+ load_4x4((const int16_t *)src, src_stride, &s7, &s8, &s9, &s10);
+ transpose_s16_4x4d(&s7, &s8, &s9, &s10);
+
+ d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+
+ d01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7));
+ d23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7));
+ d01 = vminq_u16(d01, max);
+ d23 = vminq_u16(d23, max);
+ transpose_u16_4x4q(&d01, &d23);
+
+ vst1_u16(dst + 0 * dst_stride, vget_low_u16(d01));
+ vst1_u16(dst + 1 * dst_stride, vget_low_u16(d23));
+ vst1_u16(dst + 2 * dst_stride, vget_high_u16(d01));
+ vst1_u16(dst + 3 * dst_stride, vget_high_u16(d23));
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ src += 4;
+ dst += 4;
+ w -= 4;
+ } while (w > 0);
+ } else {
+ int16x8_t t4, t5, t6, t7;
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+ uint16x8_t d0, d1, d2, d3;
+
+ if (w == 4) {
+ do {
+ load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4,
+ &s5, &s6, &s7);
+ transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+
+ load_8x8((const int16_t *)(src + 7), src_stride, &s7, &s8, &s9, &s10,
+ &t4, &t5, &t6, &t7);
+ src += 8 * src_stride;
+ __builtin_prefetch(dst + 0 * dst_stride);
+ __builtin_prefetch(dst + 1 * dst_stride);
+ __builtin_prefetch(dst + 2 * dst_stride);
+ __builtin_prefetch(dst + 3 * dst_stride);
+ __builtin_prefetch(dst + 4 * dst_stride);
+ __builtin_prefetch(dst + 5 * dst_stride);
+ __builtin_prefetch(dst + 6 * dst_stride);
+ __builtin_prefetch(dst + 7 * dst_stride);
+ transpose_s16_8x8(&s7, &s8, &s9, &s10, &t4, &t5, &t6, &t7);
+
+ __builtin_prefetch(src + 0 * src_stride);
+ __builtin_prefetch(src + 1 * src_stride);
+ __builtin_prefetch(src + 2 * src_stride);
+ __builtin_prefetch(src + 3 * src_stride);
+ __builtin_prefetch(src + 4 * src_stride);
+ __builtin_prefetch(src + 5 * src_stride);
+ __builtin_prefetch(src + 6 * src_stride);
+ __builtin_prefetch(src + 7 * src_stride);
+ d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
+ d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
+ d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
+ d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);
+
+ transpose_u16_8x4(&d0, &d1, &d2, &d3);
+ vst1_u16(dst, vget_low_u16(d0));
+ dst += dst_stride;
+ vst1_u16(dst, vget_low_u16(d1));
+ dst += dst_stride;
+ vst1_u16(dst, vget_low_u16(d2));
+ dst += dst_stride;
+ vst1_u16(dst, vget_low_u16(d3));
+ dst += dst_stride;
+ vst1_u16(dst, vget_high_u16(d0));
+ dst += dst_stride;
+ vst1_u16(dst, vget_high_u16(d1));
+ dst += dst_stride;
+ vst1_u16(dst, vget_high_u16(d2));
+ dst += dst_stride;
+ vst1_u16(dst, vget_high_u16(d3));
+ dst += dst_stride;
+ h -= 8;
+ } while (h > 0);
+ } else {
+ int width;
+ const uint16_t *s;
+ uint16_t *d;
+ int16x8_t s11, s12, s13, s14;
+ uint16x8_t d4, d5, d6, d7;
+
+ do {
+ __builtin_prefetch(src + 0 * src_stride);
+ __builtin_prefetch(src + 1 * src_stride);
+ __builtin_prefetch(src + 2 * src_stride);
+ __builtin_prefetch(src + 3 * src_stride);
+ __builtin_prefetch(src + 4 * src_stride);
+ __builtin_prefetch(src + 5 * src_stride);
+ __builtin_prefetch(src + 6 * src_stride);
+ __builtin_prefetch(src + 7 * src_stride);
+ load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4,
+ &s5, &s6, &s7);
+ transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+
+ width = w;
+ s = src + 7;
+ d = dst;
+ __builtin_prefetch(dst + 0 * dst_stride);
+ __builtin_prefetch(dst + 1 * dst_stride);
+ __builtin_prefetch(dst + 2 * dst_stride);
+ __builtin_prefetch(dst + 3 * dst_stride);
+ __builtin_prefetch(dst + 4 * dst_stride);
+ __builtin_prefetch(dst + 5 * dst_stride);
+ __builtin_prefetch(dst + 6 * dst_stride);
+ __builtin_prefetch(dst + 7 * dst_stride);
+
+ do {
+ load_8x8((const int16_t *)s, src_stride, &s7, &s8, &s9, &s10, &s11,
+ &s12, &s13, &s14);
+ transpose_s16_8x8(&s7, &s8, &s9, &s10, &s11, &s12, &s13, &s14);
+
+ d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
+ d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
+ d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
+ d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);
+ d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, max);
+ d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, max);
+ d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, max);
+ d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters, max);
+
+ transpose_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
+ store_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
+
+ s0 = s8;
+ s1 = s9;
+ s2 = s10;
+ s3 = s11;
+ s4 = s12;
+ s5 = s13;
+ s6 = s14;
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width > 0);
+ src += 8 * src_stride;
+ dst += 8 * dst_stride;
+ h -= 8;
+ } while (h > 0);
+ }
+ }
+ }
+}
+
+void vpx_highbd_convolve8_avg_horiz_neon(const uint8_t *src8,
+ ptrdiff_t src_stride, uint8_t *dst8,
+ ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, // unused
+ int y_step_q4, // unused
+ int w, int h, int bd) {
+ if (x_step_q4 != 16) {
+ vpx_highbd_convolve8_avg_horiz_c(src8, src_stride, dst8, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h, bd);
+ } else {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ const int16x8_t filters = vld1q_s16(filter_x);
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+ uint16x8_t t0, t1, t2, t3;
+
+ assert(!((intptr_t)dst & 3));
+ assert(!(dst_stride & 3));
+
+ src -= 3;
+
+ if (h == 4) {
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+ int32x4_t d0, d1, d2, d3;
+ uint16x8_t d01, d23, t01, t23;
+
+ __builtin_prefetch(src + 0 * src_stride);
+ __builtin_prefetch(src + 1 * src_stride);
+ __builtin_prefetch(src + 2 * src_stride);
+ __builtin_prefetch(src + 3 * src_stride);
+ load_8x4(src, src_stride, &t0, &t1, &t2, &t3);
+ transpose_u16_8x4(&t0, &t1, &t2, &t3);
+ s0 = vreinterpret_s16_u16(vget_low_u16(t0));
+ s1 = vreinterpret_s16_u16(vget_low_u16(t1));
+ s2 = vreinterpret_s16_u16(vget_low_u16(t2));
+ s3 = vreinterpret_s16_u16(vget_low_u16(t3));
+ s4 = vreinterpret_s16_u16(vget_high_u16(t0));
+ s5 = vreinterpret_s16_u16(vget_high_u16(t1));
+ s6 = vreinterpret_s16_u16(vget_high_u16(t2));
+ __builtin_prefetch(dst + 0 * dst_stride);
+ __builtin_prefetch(dst + 1 * dst_stride);
+ __builtin_prefetch(dst + 2 * dst_stride);
+ __builtin_prefetch(dst + 3 * dst_stride);
+ src += 7;
+
+ do {
+ load_4x4((const int16_t *)src, src_stride, &s7, &s8, &s9, &s10);
+ transpose_s16_4x4d(&s7, &s8, &s9, &s10);
+
+ d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+
+ t01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7));
+ t23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7));
+ t01 = vminq_u16(t01, max);
+ t23 = vminq_u16(t23, max);
+ transpose_u16_4x4q(&t01, &t23);
+
+ d01 = vcombine_u16(vld1_u16(dst + 0 * dst_stride),
+ vld1_u16(dst + 2 * dst_stride));
+ d23 = vcombine_u16(vld1_u16(dst + 1 * dst_stride),
+ vld1_u16(dst + 3 * dst_stride));
+ d01 = vrhaddq_u16(d01, t01);
+ d23 = vrhaddq_u16(d23, t23);
+
+ vst1_u16(dst + 0 * dst_stride, vget_low_u16(d01));
+ vst1_u16(dst + 1 * dst_stride, vget_low_u16(d23));
+ vst1_u16(dst + 2 * dst_stride, vget_high_u16(d01));
+ vst1_u16(dst + 3 * dst_stride, vget_high_u16(d23));
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ src += 4;
+ dst += 4;
+ w -= 4;
+ } while (w > 0);
+ } else {
+ int16x8_t t4, t5, t6, t7;
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+ uint16x8_t d0, d1, d2, d3, t0, t1, t2, t3;
+
+ if (w == 4) {
+ do {
+ load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4,
+ &s5, &s6, &s7);
+ transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+
+ load_8x8((const int16_t *)(src + 7), src_stride, &s7, &s8, &s9, &s10,
+ &t4, &t5, &t6, &t7);
+ src += 8 * src_stride;
+ __builtin_prefetch(dst + 0 * dst_stride);
+ __builtin_prefetch(dst + 1 * dst_stride);
+ __builtin_prefetch(dst + 2 * dst_stride);
+ __builtin_prefetch(dst + 3 * dst_stride);
+ __builtin_prefetch(dst + 4 * dst_stride);
+ __builtin_prefetch(dst + 5 * dst_stride);
+ __builtin_prefetch(dst + 6 * dst_stride);
+ __builtin_prefetch(dst + 7 * dst_stride);
+ transpose_s16_8x8(&s7, &s8, &s9, &s10, &t4, &t5, &t6, &t7);
+
+ __builtin_prefetch(src + 0 * src_stride);
+ __builtin_prefetch(src + 1 * src_stride);
+ __builtin_prefetch(src + 2 * src_stride);
+ __builtin_prefetch(src + 3 * src_stride);
+ __builtin_prefetch(src + 4 * src_stride);
+ __builtin_prefetch(src + 5 * src_stride);
+ __builtin_prefetch(src + 6 * src_stride);
+ __builtin_prefetch(src + 7 * src_stride);
+ t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
+ t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
+ t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
+ t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);
+ transpose_u16_8x4(&t0, &t1, &t2, &t3);
+
+ d0 = vcombine_u16(vld1_u16(dst + 0 * dst_stride),
+ vld1_u16(dst + 4 * dst_stride));
+ d1 = vcombine_u16(vld1_u16(dst + 1 * dst_stride),
+ vld1_u16(dst + 5 * dst_stride));
+ d2 = vcombine_u16(vld1_u16(dst + 2 * dst_stride),
+ vld1_u16(dst + 6 * dst_stride));
+ d3 = vcombine_u16(vld1_u16(dst + 3 * dst_stride),
+ vld1_u16(dst + 7 * dst_stride));
+ d0 = vrhaddq_u16(d0, t0);
+ d1 = vrhaddq_u16(d1, t1);
+ d2 = vrhaddq_u16(d2, t2);
+ d3 = vrhaddq_u16(d3, t3);
+
+ vst1_u16(dst, vget_low_u16(d0));
+ dst += dst_stride;
+ vst1_u16(dst, vget_low_u16(d1));
+ dst += dst_stride;
+ vst1_u16(dst, vget_low_u16(d2));
+ dst += dst_stride;
+ vst1_u16(dst, vget_low_u16(d3));
+ dst += dst_stride;
+ vst1_u16(dst, vget_high_u16(d0));
+ dst += dst_stride;
+ vst1_u16(dst, vget_high_u16(d1));
+ dst += dst_stride;
+ vst1_u16(dst, vget_high_u16(d2));
+ dst += dst_stride;
+ vst1_u16(dst, vget_high_u16(d3));
+ dst += dst_stride;
+ h -= 8;
+ } while (h > 0);
+ } else {
+ int width;
+ const uint16_t *s;
+ uint16_t *d;
+ int16x8_t s11, s12, s13, s14;
+ uint16x8_t d4, d5, d6, d7;
+
+ do {
+ __builtin_prefetch(src + 0 * src_stride);
+ __builtin_prefetch(src + 1 * src_stride);
+ __builtin_prefetch(src + 2 * src_stride);
+ __builtin_prefetch(src + 3 * src_stride);
+ __builtin_prefetch(src + 4 * src_stride);
+ __builtin_prefetch(src + 5 * src_stride);
+ __builtin_prefetch(src + 6 * src_stride);
+ __builtin_prefetch(src + 7 * src_stride);
+ load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4,
+ &s5, &s6, &s7);
+ transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+
+ width = w;
+ s = src + 7;
+ d = dst;
+ __builtin_prefetch(dst + 0 * dst_stride);
+ __builtin_prefetch(dst + 1 * dst_stride);
+ __builtin_prefetch(dst + 2 * dst_stride);
+ __builtin_prefetch(dst + 3 * dst_stride);
+ __builtin_prefetch(dst + 4 * dst_stride);
+ __builtin_prefetch(dst + 5 * dst_stride);
+ __builtin_prefetch(dst + 6 * dst_stride);
+ __builtin_prefetch(dst + 7 * dst_stride);
+
+ do {
+ load_8x8((const int16_t *)s, src_stride, &s7, &s8, &s9, &s10, &s11,
+ &s12, &s13, &s14);
+ transpose_s16_8x8(&s7, &s8, &s9, &s10, &s11, &s12, &s13, &s14);
+
+ d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
+ d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
+ d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
+ d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);
+ d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, max);
+ d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, max);
+ d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, max);
+ d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters, max);
+
+ transpose_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
+
+ d0 = vrhaddq_u16(d0, vld1q_u16(d + 0 * dst_stride));
+ d1 = vrhaddq_u16(d1, vld1q_u16(d + 1 * dst_stride));
+ d2 = vrhaddq_u16(d2, vld1q_u16(d + 2 * dst_stride));
+ d3 = vrhaddq_u16(d3, vld1q_u16(d + 3 * dst_stride));
+ d4 = vrhaddq_u16(d4, vld1q_u16(d + 4 * dst_stride));
+ d5 = vrhaddq_u16(d5, vld1q_u16(d + 5 * dst_stride));
+ d6 = vrhaddq_u16(d6, vld1q_u16(d + 6 * dst_stride));
+ d7 = vrhaddq_u16(d7, vld1q_u16(d + 7 * dst_stride));
+
+ store_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
+
+ s0 = s8;
+ s1 = s9;
+ s2 = s10;
+ s3 = s11;
+ s4 = s12;
+ s5 = s13;
+ s6 = s14;
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width > 0);
+ src += 8 * src_stride;
+ dst += 8 * dst_stride;
+ h -= 8;
+ } while (h > 0);
+ }
+ }
+ }
+}
+
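+// Vertical 8-tap convolution. As with the horizontal path, only
+// y_step_q4 == 16 is handled here; rows are loaded directly, so no
+// transposes are needed.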
+void vpx_highbd_convolve8_vert_neon(const uint8_t *src8, ptrdiff_t src_stride,
+ uint8_t *dst8, ptrdiff_t dst_stride,
+ const int16_t *filter_x, // unused
+ int x_step_q4, // unused
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h, int bd) {
+ if (y_step_q4 != 16) {
+ vpx_highbd_convolve8_vert_c(src8, src_stride, dst8, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h, bd);
+ } else {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ const int16x8_t filters = vld1q_s16(filter_y);
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+ assert(!((intptr_t)dst & 3));
+ assert(!(dst_stride & 3));
+
+ src -= 3 * src_stride;
+
+ if (w == 4) {
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+ int32x4_t d0, d1, d2, d3;
+ uint16x8_t d01, d23;
+
+ s0 = vreinterpret_s16_u16(vld1_u16(src));
+ src += src_stride;
+ s1 = vreinterpret_s16_u16(vld1_u16(src));
+ src += src_stride;
+ s2 = vreinterpret_s16_u16(vld1_u16(src));
+ src += src_stride;
+ s3 = vreinterpret_s16_u16(vld1_u16(src));
+ src += src_stride;
+ s4 = vreinterpret_s16_u16(vld1_u16(src));
+ src += src_stride;
+ s5 = vreinterpret_s16_u16(vld1_u16(src));
+ src += src_stride;
+ s6 = vreinterpret_s16_u16(vld1_u16(src));
+ src += src_stride;
+
+ do {
+ s7 = vreinterpret_s16_u16(vld1_u16(src));
+ src += src_stride;
+ s8 = vreinterpret_s16_u16(vld1_u16(src));
+ src += src_stride;
+ s9 = vreinterpret_s16_u16(vld1_u16(src));
+ src += src_stride;
+ s10 = vreinterpret_s16_u16(vld1_u16(src));
+ src += src_stride;
+
+ __builtin_prefetch(dst + 0 * dst_stride);
+ __builtin_prefetch(dst + 1 * dst_stride);
+ __builtin_prefetch(dst + 2 * dst_stride);
+ __builtin_prefetch(dst + 3 * dst_stride);
+ __builtin_prefetch(src + 0 * src_stride);
+ __builtin_prefetch(src + 1 * src_stride);
+ __builtin_prefetch(src + 2 * src_stride);
+ __builtin_prefetch(src + 3 * src_stride);
+ d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+
+ d01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7));
+ d23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7));
+ d01 = vminq_u16(d01, max);
+ d23 = vminq_u16(d23, max);
+ vst1_u16(dst, vget_low_u16(d01));
+ dst += dst_stride;
+ vst1_u16(dst, vget_high_u16(d01));
+ dst += dst_stride;
+ vst1_u16(dst, vget_low_u16(d23));
+ dst += dst_stride;
+ vst1_u16(dst, vget_high_u16(d23));
+ dst += dst_stride;
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ h -= 4;
+ } while (h > 0);
+ } else {
+ int height;
+ const uint16_t *s;
+ uint16_t *d;
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+ uint16x8_t d0, d1, d2, d3;
+
+ do {
+ __builtin_prefetch(src + 0 * src_stride);
+ __builtin_prefetch(src + 1 * src_stride);
+ __builtin_prefetch(src + 2 * src_stride);
+ __builtin_prefetch(src + 3 * src_stride);
+ __builtin_prefetch(src + 4 * src_stride);
+ __builtin_prefetch(src + 5 * src_stride);
+ __builtin_prefetch(src + 6 * src_stride);
+ s = src;
+ s0 = vreinterpretq_s16_u16(vld1q_u16(s));
+ s += src_stride;
+ s1 = vreinterpretq_s16_u16(vld1q_u16(s));
+ s += src_stride;
+ s2 = vreinterpretq_s16_u16(vld1q_u16(s));
+ s += src_stride;
+ s3 = vreinterpretq_s16_u16(vld1q_u16(s));
+ s += src_stride;
+ s4 = vreinterpretq_s16_u16(vld1q_u16(s));
+ s += src_stride;
+ s5 = vreinterpretq_s16_u16(vld1q_u16(s));
+ s += src_stride;
+ s6 = vreinterpretq_s16_u16(vld1q_u16(s));
+ s += src_stride;
+ d = dst;
+ height = h;
+
+ do {
+ s7 = vreinterpretq_s16_u16(vld1q_u16(s));
+ s += src_stride;
+ s8 = vreinterpretq_s16_u16(vld1q_u16(s));
+ s += src_stride;
+ s9 = vreinterpretq_s16_u16(vld1q_u16(s));
+ s += src_stride;
+ s10 = vreinterpretq_s16_u16(vld1q_u16(s));
+ s += src_stride;
+
+ __builtin_prefetch(d + 0 * dst_stride);
+ __builtin_prefetch(d + 1 * dst_stride);
+ __builtin_prefetch(d + 2 * dst_stride);
+ __builtin_prefetch(d + 3 * dst_stride);
+ __builtin_prefetch(s + 0 * src_stride);
+ __builtin_prefetch(s + 1 * src_stride);
+ __builtin_prefetch(s + 2 * src_stride);
+ __builtin_prefetch(s + 3 * src_stride);
+ d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
+ d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
+ d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
+ d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);
+
+ vst1q_u16(d, d0);
+ d += dst_stride;
+ vst1q_u16(d, d1);
+ d += dst_stride;
+ vst1q_u16(d, d2);
+ d += dst_stride;
+ vst1q_u16(d, d3);
+ d += dst_stride;
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ height -= 4;
+ } while (height > 0);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ } while (w > 0);
+ }
+ }
+}
+
+void vpx_highbd_convolve8_avg_vert_neon(const uint8_t *src8,
+ ptrdiff_t src_stride, uint8_t *dst8,
+ ptrdiff_t dst_stride,
+ const int16_t *filter_x, // unused
+ int x_step_q4, // unused
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h, int bd) {
+ if (y_step_q4 != 16) {
+ vpx_highbd_convolve8_avg_vert_c(src8, src_stride, dst8, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4, w,
+ h, bd);
+ } else {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ const int16x8_t filters = vld1q_s16(filter_y);
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+ assert(!((intptr_t)dst & 3));
+ assert(!(dst_stride & 3));
+
+ src -= 3 * src_stride;
+
+ if (w == 4) {
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+ int32x4_t d0, d1, d2, d3;
+ uint16x8_t d01, d23, t01, t23;
+
+ s0 = vreinterpret_s16_u16(vld1_u16(src));
+ src += src_stride;
+ s1 = vreinterpret_s16_u16(vld1_u16(src));
+ src += src_stride;
+ s2 = vreinterpret_s16_u16(vld1_u16(src));
+ src += src_stride;
+ s3 = vreinterpret_s16_u16(vld1_u16(src));
+ src += src_stride;
+ s4 = vreinterpret_s16_u16(vld1_u16(src));
+ src += src_stride;
+ s5 = vreinterpret_s16_u16(vld1_u16(src));
+ src += src_stride;
+ s6 = vreinterpret_s16_u16(vld1_u16(src));
+ src += src_stride;
+
+ do {
+ s7 = vreinterpret_s16_u16(vld1_u16(src));
+ src += src_stride;
+ s8 = vreinterpret_s16_u16(vld1_u16(src));
+ src += src_stride;
+ s9 = vreinterpret_s16_u16(vld1_u16(src));
+ src += src_stride;
+ s10 = vreinterpret_s16_u16(vld1_u16(src));
+ src += src_stride;
+
+ __builtin_prefetch(dst + 0 * dst_stride);
+ __builtin_prefetch(dst + 1 * dst_stride);
+ __builtin_prefetch(dst + 2 * dst_stride);
+ __builtin_prefetch(dst + 3 * dst_stride);
+ __builtin_prefetch(src + 0 * src_stride);
+ __builtin_prefetch(src + 1 * src_stride);
+ __builtin_prefetch(src + 2 * src_stride);
+ __builtin_prefetch(src + 3 * src_stride);
+ d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+
+ t01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7));
+ t23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7));
+ t01 = vminq_u16(t01, max);
+ t23 = vminq_u16(t23, max);
+
+ d01 = vcombine_u16(vld1_u16(dst + 0 * dst_stride),
+ vld1_u16(dst + 1 * dst_stride));
+ d23 = vcombine_u16(vld1_u16(dst + 2 * dst_stride),
+ vld1_u16(dst + 3 * dst_stride));
+ d01 = vrhaddq_u16(d01, t01);
+ d23 = vrhaddq_u16(d23, t23);
+
+ vst1_u16(dst, vget_low_u16(d01));
+ dst += dst_stride;
+ vst1_u16(dst, vget_high_u16(d01));
+ dst += dst_stride;
+ vst1_u16(dst, vget_low_u16(d23));
+ dst += dst_stride;
+ vst1_u16(dst, vget_high_u16(d23));
+ dst += dst_stride;
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ h -= 4;
+ } while (h > 0);
+ } else {
+ int height;
+ const uint16_t *s;
+ uint16_t *d;
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+ uint16x8_t d0, d1, d2, d3, t0, t1, t2, t3;
+
+ do {
+ __builtin_prefetch(src + 0 * src_stride);
+ __builtin_prefetch(src + 1 * src_stride);
+ __builtin_prefetch(src + 2 * src_stride);
+ __builtin_prefetch(src + 3 * src_stride);
+ __builtin_prefetch(src + 4 * src_stride);
+ __builtin_prefetch(src + 5 * src_stride);
+ __builtin_prefetch(src + 6 * src_stride);
+ s = src;
+ s0 = vreinterpretq_s16_u16(vld1q_u16(s));
+ s += src_stride;
+ s1 = vreinterpretq_s16_u16(vld1q_u16(s));
+ s += src_stride;
+ s2 = vreinterpretq_s16_u16(vld1q_u16(s));
+ s += src_stride;
+ s3 = vreinterpretq_s16_u16(vld1q_u16(s));
+ s += src_stride;
+ s4 = vreinterpretq_s16_u16(vld1q_u16(s));
+ s += src_stride;
+ s5 = vreinterpretq_s16_u16(vld1q_u16(s));
+ s += src_stride;
+ s6 = vreinterpretq_s16_u16(vld1q_u16(s));
+ s += src_stride;
+ d = dst;
+ height = h;
+
+ do {
+ s7 = vreinterpretq_s16_u16(vld1q_u16(s));
+ s += src_stride;
+ s8 = vreinterpretq_s16_u16(vld1q_u16(s));
+ s += src_stride;
+ s9 = vreinterpretq_s16_u16(vld1q_u16(s));
+ s += src_stride;
+ s10 = vreinterpretq_s16_u16(vld1q_u16(s));
+ s += src_stride;
+
+ __builtin_prefetch(d + 0 * dst_stride);
+ __builtin_prefetch(d + 1 * dst_stride);
+ __builtin_prefetch(d + 2 * dst_stride);
+ __builtin_prefetch(d + 3 * dst_stride);
+ __builtin_prefetch(s + 0 * src_stride);
+ __builtin_prefetch(s + 1 * src_stride);
+ __builtin_prefetch(s + 2 * src_stride);
+ __builtin_prefetch(s + 3 * src_stride);
+ t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
+ t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
+ t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
+ t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);
+
+ d0 = vld1q_u16(d + 0 * dst_stride);
+ d1 = vld1q_u16(d + 1 * dst_stride);
+ d2 = vld1q_u16(d + 2 * dst_stride);
+ d3 = vld1q_u16(d + 3 * dst_stride);
+ d0 = vrhaddq_u16(d0, t0);
+ d1 = vrhaddq_u16(d1, t1);
+ d2 = vrhaddq_u16(d2, t2);
+ d3 = vrhaddq_u16(d3, t3);
+
+ vst1q_u16(d, d0);
+ d += dst_stride;
+ vst1q_u16(d, d1);
+ d += dst_stride;
+ vst1q_u16(d, d2);
+ d += dst_stride;
+ vst1q_u16(d, d3);
+ d += dst_stride;
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ height -= 4;
+ } while (height > 0);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ } while (w > 0);
+ }
+ }
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c
new file mode 100644
index 00000000000..f4d70761eb3
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
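+// Average the source block into the destination with rounding halving adds,
+// specialized by block width (4, 8, 16, 32 and 64).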
+void vpx_highbd_convolve_avg_neon(const uint8_t *src8, ptrdiff_t src_stride,
+ uint8_t *dst8, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int filter_x_stride,
+ const int16_t *filter_y, int filter_y_stride,
+ int w, int h, int bd) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+
+ (void)filter_x;
+ (void)filter_x_stride;
+ (void)filter_y;
+ (void)filter_y_stride;
+ (void)bd;
+
+ if (w < 8) { // avg4
+ uint16x4_t s0, s1, d0, d1;
+ uint16x8_t s01, d01;
+ do {
+ s0 = vld1_u16(src);
+ d0 = vld1_u16(dst);
+ src += src_stride;
+ s1 = vld1_u16(src);
+ d1 = vld1_u16(dst + dst_stride);
+ src += src_stride;
+ s01 = vcombine_u16(s0, s1);
+ d01 = vcombine_u16(d0, d1);
+ d01 = vrhaddq_u16(s01, d01);
+ vst1_u16(dst, vget_low_u16(d01));
+ dst += dst_stride;
+ vst1_u16(dst, vget_high_u16(d01));
+ dst += dst_stride;
+ h -= 2;
+ } while (h > 0);
+ } else if (w == 8) { // avg8
+ uint16x8_t s0, s1, d0, d1;
+ do {
+ s0 = vld1q_u16(src);
+ d0 = vld1q_u16(dst);
+ src += src_stride;
+ s1 = vld1q_u16(src);
+ d1 = vld1q_u16(dst + dst_stride);
+ src += src_stride;
+
+ d0 = vrhaddq_u16(s0, d0);
+ d1 = vrhaddq_u16(s1, d1);
+
+ vst1q_u16(dst, d0);
+ dst += dst_stride;
+ vst1q_u16(dst, d1);
+ dst += dst_stride;
+ h -= 2;
+ } while (h > 0);
+ } else if (w < 32) { // avg16
+ uint16x8_t s0l, s0h, s1l, s1h, d0l, d0h, d1l, d1h;
+ do {
+ s0l = vld1q_u16(src);
+ s0h = vld1q_u16(src + 8);
+ d0l = vld1q_u16(dst);
+ d0h = vld1q_u16(dst + 8);
+ src += src_stride;
+ s1l = vld1q_u16(src);
+ s1h = vld1q_u16(src + 8);
+ d1l = vld1q_u16(dst + dst_stride);
+ d1h = vld1q_u16(dst + dst_stride + 8);
+ src += src_stride;
+
+ d0l = vrhaddq_u16(s0l, d0l);
+ d0h = vrhaddq_u16(s0h, d0h);
+ d1l = vrhaddq_u16(s1l, d1l);
+ d1h = vrhaddq_u16(s1h, d1h);
+
+ vst1q_u16(dst, d0l);
+ vst1q_u16(dst + 8, d0h);
+ dst += dst_stride;
+ vst1q_u16(dst, d1l);
+ vst1q_u16(dst + 8, d1h);
+ dst += dst_stride;
+ h -= 2;
+ } while (h > 0);
+ } else if (w == 32) { // avg32
+ uint16x8_t s0, s1, s2, s3, d0, d1, d2, d3;
+ do {
+ s0 = vld1q_u16(src);
+ s1 = vld1q_u16(src + 8);
+ s2 = vld1q_u16(src + 16);
+ s3 = vld1q_u16(src + 24);
+ d0 = vld1q_u16(dst);
+ d1 = vld1q_u16(dst + 8);
+ d2 = vld1q_u16(dst + 16);
+ d3 = vld1q_u16(dst + 24);
+ src += src_stride;
+
+ d0 = vrhaddq_u16(s0, d0);
+ d1 = vrhaddq_u16(s1, d1);
+ d2 = vrhaddq_u16(s2, d2);
+ d3 = vrhaddq_u16(s3, d3);
+
+ vst1q_u16(dst, d0);
+ vst1q_u16(dst + 8, d1);
+ vst1q_u16(dst + 16, d2);
+ vst1q_u16(dst + 24, d3);
+ dst += dst_stride;
+
+ s0 = vld1q_u16(src);
+ s1 = vld1q_u16(src + 8);
+ s2 = vld1q_u16(src + 16);
+ s3 = vld1q_u16(src + 24);
+ d0 = vld1q_u16(dst);
+ d1 = vld1q_u16(dst + 8);
+ d2 = vld1q_u16(dst + 16);
+ d3 = vld1q_u16(dst + 24);
+ src += src_stride;
+
+ d0 = vrhaddq_u16(s0, d0);
+ d1 = vrhaddq_u16(s1, d1);
+ d2 = vrhaddq_u16(s2, d2);
+ d3 = vrhaddq_u16(s3, d3);
+
+ vst1q_u16(dst, d0);
+ vst1q_u16(dst + 8, d1);
+ vst1q_u16(dst + 16, d2);
+ vst1q_u16(dst + 24, d3);
+ dst += dst_stride;
+ h -= 2;
+ } while (h > 0);
+ } else { // avg64
+ uint16x8_t s0, s1, s2, s3, d0, d1, d2, d3;
+ do {
+ s0 = vld1q_u16(src);
+ s1 = vld1q_u16(src + 8);
+ s2 = vld1q_u16(src + 16);
+ s3 = vld1q_u16(src + 24);
+ d0 = vld1q_u16(dst);
+ d1 = vld1q_u16(dst + 8);
+ d2 = vld1q_u16(dst + 16);
+ d3 = vld1q_u16(dst + 24);
+
+ d0 = vrhaddq_u16(s0, d0);
+ d1 = vrhaddq_u16(s1, d1);
+ d2 = vrhaddq_u16(s2, d2);
+ d3 = vrhaddq_u16(s3, d3);
+
+ vst1q_u16(dst, d0);
+ vst1q_u16(dst + 8, d1);
+ vst1q_u16(dst + 16, d2);
+ vst1q_u16(dst + 24, d3);
+
+ s0 = vld1q_u16(src + 32);
+ s1 = vld1q_u16(src + 40);
+ s2 = vld1q_u16(src + 48);
+ s3 = vld1q_u16(src + 56);
+ d0 = vld1q_u16(dst + 32);
+ d1 = vld1q_u16(dst + 40);
+ d2 = vld1q_u16(dst + 48);
+ d3 = vld1q_u16(dst + 56);
+
+ d0 = vrhaddq_u16(s0, d0);
+ d1 = vrhaddq_u16(s1, d1);
+ d2 = vrhaddq_u16(s2, d2);
+ d3 = vrhaddq_u16(s3, d3);
+
+ vst1q_u16(dst + 32, d0);
+ vst1q_u16(dst + 40, d1);
+ vst1q_u16(dst + 48, d2);
+ vst1q_u16(dst + 56, d3);
+ src += src_stride;
+ dst += dst_stride;
+ } while (--h);
+ }
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c
new file mode 100644
index 00000000000..a980ab1a380
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
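+// Straight block copy specialized by width; the wider cases use multi-vector
+// vld2q/vld4q loads and stores to move 16 or 32 pixels at a time.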
+void vpx_highbd_convolve_copy_neon(const uint8_t *src8, ptrdiff_t src_stride,
+ uint8_t *dst8, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int filter_x_stride,
+ const int16_t *filter_y, int filter_y_stride,
+ int w, int h, int bd) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+
+ (void)filter_x;
+ (void)filter_x_stride;
+ (void)filter_y;
+ (void)filter_y_stride;
+ (void)bd;
+
+ if (w < 8) { // copy4
+ do {
+ vst1_u16(dst, vld1_u16(src));
+ src += src_stride;
+ dst += dst_stride;
+ vst1_u16(dst, vld1_u16(src));
+ src += src_stride;
+ dst += dst_stride;
+ h -= 2;
+ } while (h > 0);
+ } else if (w == 8) { // copy8
+ do {
+ vst1q_u16(dst, vld1q_u16(src));
+ src += src_stride;
+ dst += dst_stride;
+ vst1q_u16(dst, vld1q_u16(src));
+ src += src_stride;
+ dst += dst_stride;
+ h -= 2;
+ } while (h > 0);
+ } else if (w < 32) { // copy16
+ do {
+ vst2q_u16(dst, vld2q_u16(src));
+ src += src_stride;
+ dst += dst_stride;
+ vst2q_u16(dst, vld2q_u16(src));
+ src += src_stride;
+ dst += dst_stride;
+ vst2q_u16(dst, vld2q_u16(src));
+ src += src_stride;
+ dst += dst_stride;
+ vst2q_u16(dst, vld2q_u16(src));
+ src += src_stride;
+ dst += dst_stride;
+ h -= 4;
+ } while (h > 0);
+ } else if (w == 32) { // copy32
+ do {
+ vst4q_u16(dst, vld4q_u16(src));
+ src += src_stride;
+ dst += dst_stride;
+ vst4q_u16(dst, vld4q_u16(src));
+ src += src_stride;
+ dst += dst_stride;
+ vst4q_u16(dst, vld4q_u16(src));
+ src += src_stride;
+ dst += dst_stride;
+ vst4q_u16(dst, vld4q_u16(src));
+ src += src_stride;
+ dst += dst_stride;
+ h -= 4;
+ } while (h > 0);
+ } else { // copy64
+ do {
+ vst4q_u16(dst, vld4q_u16(src));
+ vst4q_u16(dst + 32, vld4q_u16(src + 32));
+ src += src_stride;
+ dst += dst_stride;
+ vst4q_u16(dst, vld4q_u16(src));
+ vst4q_u16(dst + 32, vld4q_u16(src + 32));
+ src += src_stride;
+ dst += dst_stride;
+ vst4q_u16(dst, vld4q_u16(src));
+ vst4q_u16(dst + 32, vld4q_u16(src + 32));
+ src += src_stride;
+ dst += dst_stride;
+ vst4q_u16(dst, vld4q_u16(src));
+ vst4q_u16(dst + 32, vld4q_u16(src + 32));
+ src += src_stride;
+ dst += dst_stride;
+ h -= 4;
+ } while (h > 0);
+ }
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c
new file mode 100644
index 00000000000..4e6e109920a
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
+#include "vpx_ports/mem.h"
+
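+// 2-D 8-tap convolution: a horizontal pass into an intermediate buffer
+// followed by a vertical pass from that buffer into the destination.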
+void vpx_highbd_convolve8_neon(const uint8_t *src8, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h, int bd) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const int y0_q4 = get_filter_offset(filter_y, get_filter_base(filter_y));
+ // + 1 to make it divisible by 4
+ DECLARE_ALIGNED(16, uint16_t, temp[64 * 136]);
+ const int intermediate_height =
+ (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+
+ /* Filter starting 3 lines back. The neon implementation will ignore the given
+ * height and filter a multiple of 4 lines. Since this goes into the temp
+ * buffer, which has lots of extra room and is subsequently discarded, this
+ * is safe if somewhat less than ideal. */
+ vpx_highbd_convolve8_horiz_neon(CONVERT_TO_BYTEPTR(src - src_stride * 3),
+ src_stride, CONVERT_TO_BYTEPTR(temp), w,
+ filter_x, x_step_q4, filter_y, y_step_q4, w,
+ intermediate_height, bd);
+
+ /* Step into the temp buffer 3 lines to get the actual frame data */
+ vpx_highbd_convolve8_vert_neon(CONVERT_TO_BYTEPTR(temp + w * 3), w, dst,
+ dst_stride, filter_x, x_step_q4, filter_y,
+ y_step_q4, w, h, bd);
+}
+
+void vpx_highbd_convolve8_avg_neon(const uint8_t *src8, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h, int bd) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const int y0_q4 = get_filter_offset(filter_y, get_filter_base(filter_y));
+ // + 1 to make it divisible by 4
+ DECLARE_ALIGNED(16, uint16_t, temp[64 * 136]);
+ const int intermediate_height =
+ (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+
+ /* This implementation has the same issues as above. In addition, we only want
+ * to average the values after both passes.
+ */
+ vpx_highbd_convolve8_horiz_neon(CONVERT_TO_BYTEPTR(src - src_stride * 3),
+ src_stride, CONVERT_TO_BYTEPTR(temp), w,
+ filter_x, x_step_q4, filter_y, y_step_q4, w,
+ intermediate_height, bd);
+ vpx_highbd_convolve8_avg_vert_neon(CONVERT_TO_BYTEPTR(temp + w * 3), w, dst,
+ dst_stride, filter_x, x_step_q4, filter_y,
+ y_step_q4, w, h, bd);
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.asm
index dc459e20d9c..e3c0c5210d2 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.asm
@@ -25,9 +25,8 @@
|vpx_idct16x16_1_add_neon| PROC
ldrsh r0, [r0]
- ; generate cospi_16_64 = 11585
- mov r12, #0x2d00
- add r12, #0x41
+ ; cospi_16_64 = 11585
+ movw r12, #0x2d41
; out = dct_const_round_shift(input[0] * cospi_16_64)
mul r0, r0, r12 ; input[0] * cospi_16_64
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c
index 4035830f3c8..f1e49ff5178 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c
@@ -21,7 +21,7 @@ void vpx_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest,
uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16;
int16x8_t q0s16;
uint8_t *d1, *d2;
- int16_t i, j, a1, cospi_16_64 = 11585;
+ int16_t i, j, a1;
int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
out = dct_const_round_shift(out * cospi_16_64);
a1 = ROUND_POWER_OF_TWO(out, 6);
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.asm
index 22a0c95941a..5e64cea0ae7 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.asm
@@ -60,13 +60,11 @@
vld2.s16 {q1,q2}, [r0]!
vmov.s16 q15, q1
- ; generate cospi_28_64 = 3196
- mov r3, #0xc00
- add r3, #0x7c
+ ; cospi_28_64 = 3196
+ movw r3, #0x0c7c
- ; generate cospi_4_64 = 16069
- mov r12, #0x3e00
- add r12, #0xc5
+ ; cospi_4_64 = 16069
+ movw r12, #0x3ec5
; transpose the input data
TRANSPOSE8X8
@@ -76,13 +74,11 @@
vdup.16 d1, r12 ; duplicate cospi_4_64
; preloading to avoid stall
- ; generate cospi_12_64 = 13623
- mov r3, #0x3500
- add r3, #0x37
+ ; cospi_12_64 = 13623
+ movw r3, #0x3537
- ; generate cospi_20_64 = 9102
- mov r12, #0x2300
- add r12, #0x8e
+ ; cospi_20_64 = 9102
+ movw r12, #0x238e
; step2[4] * cospi_28_64
vmull.s16 q2, d18, d0
@@ -112,13 +108,11 @@
vqrshrn.s32 d15, q6, #14 ; >> 14
; preloading to avoid stall
- ; generate cospi_16_64 = 11585
- mov r3, #0x2d00
- add r3, #0x41
+ ; cospi_16_64 = 11585
+ movw r3, #0x2d41
- ; generate cospi_24_64 = 6270
- mov r12, #0x1800
- add r12, #0x7e
+ ; cospi_24_64 = 6270
+ movw r12, #0x187e
; step2[5] * cospi_12_64
vmull.s16 q2, d26, d2
@@ -155,9 +149,8 @@
vmull.s16 q0, d24, d30
vmull.s16 q1, d25, d30
- ; generate cospi_8_64 = 15137
- mov r3, #0x3b00
- add r3, #0x21
+ ; cospi_8_64 = 15137
+ movw r3, #0x3b21
vdup.16 d30, r12 ; duplicate cospi_24_64
vdup.16 d31, r3 ; duplicate cospi_8_64
@@ -208,9 +201,8 @@
vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7];
vadd.s16 q15, q6, q7 ; step2[7] = step1[6] + step1[7];
- ; generate cospi_16_64 = 11585
- mov r3, #0x2d00
- add r3, #0x41
+ ; cospi_16_64 = 11585
+ movw r3, #0x2d41
; stage 5
vadd.s16 q0, q8, q11 ; step1[0] = step2[0] + step2[3];
@@ -307,13 +299,11 @@
vld2.s16 {q0,q1}, [r0]!
vmov.s16 q15, q0;
- ; generate cospi_30_64 = 1606
- mov r3, #0x0600
- add r3, #0x46
+ ; cospi_30_64 = 1606
+ movw r3, #0x0646
- ; generate cospi_2_64 = 16305
- mov r12, #0x3f00
- add r12, #0xb1
+ ; cospi_2_64 = 16305
+ movw r12, #0x3fb1
; transpose the input data
TRANSPOSE8X8
@@ -323,13 +313,11 @@
vdup.16 d13, r12 ; duplicate cospi_2_64
; preloading to avoid stall
- ; generate cospi_14_64 = 12665
- mov r3, #0x3100
- add r3, #0x79
+ ; cospi_14_64 = 12665
+ movw r3, #0x3179
- ; generate cospi_18_64 = 10394
- mov r12, #0x2800
- add r12, #0x9a
+ ; cospi_18_64 = 10394
+ movw r12, #0x289a
; step1[8] * cospi_30_64
vmull.s16 q2, d16, d12
@@ -359,13 +347,11 @@
vqrshrn.s32 d15, q4, #14 ; >> 14
; preloading to avoid stall
- ; generate cospi_22_64 = 7723
- mov r3, #0x1e00
- add r3, #0x2b
+ ; cospi_22_64 = 7723
+ movw r3, #0x1e2b
- ; generate cospi_10_64 = 14449
- mov r12, #0x3800
- add r12, #0x71
+ ; cospi_10_64 = 14449
+ movw r12, #0x3871
; step1[9] * cospi_14_64
vmull.s16 q2, d24, d30
@@ -411,13 +397,11 @@
vmlal.s16 q5, d27, d30
; preloading to avoid stall
- ; generate cospi_6_64 = 15679
- mov r3, #0x3d00
- add r3, #0x3f
+ ; cospi_6_64 = 15679
+ movw r3, #0x3d3f
- ; generate cospi_26_64 = 4756
- mov r12, #0x1200
- add r12, #0x94
+ ; cospi_26_64 = 4756
+ movw r12, #0x1294
vdup.16 d30, r3 ; duplicate cospi_6_64
vdup.16 d31, r12 ; duplicate cospi_26_64
@@ -466,13 +450,11 @@
vadd.s16 q7, q6, q7 ; step1[15]=step2[14]+step2[15]
; stage 4
- ; generate cospi_24_64 = 6270
- mov r3, #0x1800
- add r3, #0x7e
+ ; cospi_24_64 = 6270
+ movw r3, #0x187e
- ; generate cospi_8_64 = 15137
- mov r12, #0x3b00
- add r12, #0x21
+ ; cospi_8_64 = 15137
+ movw r12, #0x3b21
; -step1[9] * cospi_8_64 + step1[14] * cospi_24_64
vdup.16 d30, r12 ; duplicate cospi_8_64
@@ -543,9 +525,8 @@
vadd.s16 q15, q7, q4 ; step1[15] =step2[12]+step2[15];
; stage 6.
- ; generate cospi_16_64 = 11585
- mov r12, #0x2d00
- add r12, #0x41
+ ; cospi_16_64 = 11585
+ movw r12, #0x2d41
vdup.16 d14, r12 ; duplicate cospi_16_64
@@ -810,13 +791,11 @@ end_idct16x16_pass2
vld2.s16 {q1,q2}, [r0]!
vmov.s16 q15, q1
- ; generate cospi_28_64*2 = 6392
- mov r3, #0x1800
- add r3, #0xf8
+ ; cospi_28_64*2 = 6392
+ movw r3, #0x18f8
- ; generate cospi_4_64*2 = 32138
- mov r12, #0x7d00
- add r12, #0x8a
+ ; cospi_4_64*2 = 32138
+ movw r12, #0x7d8a
; transpose the input data
TRANSPOSE8X8
@@ -833,9 +812,8 @@ end_idct16x16_pass2
vqrdmulh.s16 q4, q9, q0
; preloading to avoid stall
- ; generate cospi_16_64*2 = 23170
- mov r3, #0x5a00
- add r3, #0x82
+ ; cospi_16_64*2 = 23170
+ movw r3, #0x5a82
; dct_const_round_shift(step2[4] * cospi_4_64);
vqrdmulh.s16 q7, q9, q1
@@ -843,9 +821,8 @@ end_idct16x16_pass2
; stage 4
vdup.16 q1, r3 ; cospi_16_64*2
- ; generate cospi_16_64 = 11585
- mov r3, #0x2d00
- add r3, #0x41
+ ; cospi_16_64 = 11585
+ movw r3, #0x2d41
vdup.16 d4, r3; ; duplicate cospi_16_64
@@ -939,13 +916,11 @@ end_idct16x16_pass2
vld2.s16 {q0,q1}, [r0]!
vmov.s16 q15, q0;
- ; generate 2*cospi_30_64 = 3212
- mov r3, #0xc00
- add r3, #0x8c
+ ; 2*cospi_30_64 = 3212
+ movw r3, #0x0c8c
- ; generate 2*cospi_2_64 = 32610
- mov r12, #0x7f00
- add r12, #0x62
+ ; 2*cospi_2_64 = 32610
+ movw r12, #0x7f62
; transpose the input data
TRANSPOSE8X8
@@ -962,15 +937,13 @@ end_idct16x16_pass2
vqrdmulh.s16 q7, q8, q6
; preloading to avoid stall
- ; generate 2*cospi_26_64 = 9512
- mov r12, #0x2500
- add r12, #0x28
+ ; 2*cospi_26_64 = 9512
+ movw r12, #0x2528
rsb r12, #0
vdup.16 q15, r12 ; duplicate -2*cospi_26_64
- ; generate 2*cospi_6_64 = 31358
- mov r3, #0x7a00
- add r3, #0x7e
+ ; 2*cospi_6_64 = 31358
+ movw r3, #0x7a7e
vdup.16 q14, r3 ; duplicate 2*cospi_6_64
; dct_const_round_shift(- step1[12] * cospi_26_64)
@@ -980,14 +953,12 @@ end_idct16x16_pass2
vqrdmulh.s16 q4, q9, q14
; stage 4
- ; generate cospi_24_64 = 6270
- mov r3, #0x1800
- add r3, #0x7e
+ ; cospi_24_64 = 6270
+ movw r3, #0x187e
vdup.16 d31, r3 ; duplicate cospi_24_64
- ; generate cospi_8_64 = 15137
- mov r12, #0x3b00
- add r12, #0x21
+ ; cospi_8_64 = 15137
+ movw r12, #0x3b21
vdup.16 d30, r12 ; duplicate cospi_8_64
; step1[14] * cospi_24_64
@@ -1052,9 +1023,8 @@ end_idct16x16_pass2
vadd.s16 q15, q7, q4 ; step1[15] =step2[12]+step2[15];
; stage 6.
- ; generate cospi_16_64 = 11585
- mov r12, #0x2d00
- add r12, #0x41
+ ; cospi_16_64 = 11585
+ movw r12, #0x2d41
vdup.16 d14, r12 ; duplicate cospi_16_64
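The hunks above collapse each two-instruction constant load (a mov of the high
byte followed by an add of the low byte) into a single movw, which materializes
the full 16-bit immediate at once; the values are unchanged, e.g. 0x2d00 + 0x41
= 0x2d41 = 11585 for cospi_16_64. As an aside (not part of the patch), these
constants are the usual DCT cosines scaled by 2^14; a small C check of the
values loaded above, assuming cospi_n_64 == round(cos(n*pi/64) * (1 << 14)):

/* Illustrative check only, not part of the patch. */
#include <math.h>
#include <stdio.h>

int main(void) {
  const double pi = 3.14159265358979323846;
  const int n[5] = { 16, 30, 2, 24, 8 };
  const int table[5] = { 11585, 1606, 16305, 6270, 15137 };
  int i;
  for (i = 0; i < 5; ++i) {
    /* all values are positive, so +0.5 and truncation rounds to nearest */
    const int c = (int)(cos(n[i] * pi / 64.0) * (1 << 14) + 0.5);
    printf("cospi_%2d_64 = %5d (table: %5d)\n", n[i], c, table[i]);
  }
  return 0;
}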
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.c
index ce5cbcbcda5..f682afc7bf6 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.c
@@ -73,8 +73,8 @@ void vpx_idct16x16_256_add_neon_pass1(int16_t *in, int16_t *out,
d31s16 = vget_high_s16(q15s16);
// stage 3
- d0s16 = vdup_n_s16(cospi_28_64);
- d1s16 = vdup_n_s16(cospi_4_64);
+ d0s16 = vdup_n_s16((int16_t)cospi_28_64);
+ d1s16 = vdup_n_s16((int16_t)cospi_4_64);
q2s32 = vmull_s16(d18s16, d0s16);
q3s32 = vmull_s16(d19s16, d0s16);
@@ -86,8 +86,8 @@ void vpx_idct16x16_256_add_neon_pass1(int16_t *in, int16_t *out,
q5s32 = vmlal_s16(q5s32, d30s16, d0s16);
q6s32 = vmlal_s16(q6s32, d31s16, d0s16);
- d2s16 = vdup_n_s16(cospi_12_64);
- d3s16 = vdup_n_s16(cospi_20_64);
+ d2s16 = vdup_n_s16((int16_t)cospi_12_64);
+ d3s16 = vdup_n_s16((int16_t)cospi_20_64);
d8s16 = vqrshrn_n_s32(q2s32, 14);
d9s16 = vqrshrn_n_s32(q3s32, 14);
@@ -114,15 +114,15 @@ void vpx_idct16x16_256_add_neon_pass1(int16_t *in, int16_t *out,
q6s16 = vcombine_s16(d12s16, d13s16);
// stage 4
- d30s16 = vdup_n_s16(cospi_16_64);
+ d30s16 = vdup_n_s16((int16_t)cospi_16_64);
q2s32 = vmull_s16(d16s16, d30s16);
q11s32 = vmull_s16(d17s16, d30s16);
q0s32 = vmull_s16(d24s16, d30s16);
q1s32 = vmull_s16(d25s16, d30s16);
- d30s16 = vdup_n_s16(cospi_24_64);
- d31s16 = vdup_n_s16(cospi_8_64);
+ d30s16 = vdup_n_s16((int16_t)cospi_24_64);
+ d31s16 = vdup_n_s16((int16_t)cospi_8_64);
q3s32 = vaddq_s32(q2s32, q0s32);
q12s32 = vaddq_s32(q11s32, q1s32);
@@ -168,7 +168,7 @@ void vpx_idct16x16_256_add_neon_pass1(int16_t *in, int16_t *out,
q2s16 = vsubq_s16(q9s16, q10s16);
q3s16 = vsubq_s16(q8s16, q11s16);
- d16s16 = vdup_n_s16(cospi_16_64);
+ d16s16 = vdup_n_s16((int16_t)cospi_16_64);
q11s32 = vmull_s16(d26s16, d16s16);
q12s32 = vmull_s16(d27s16, d16s16);
@@ -313,8 +313,8 @@ void vpx_idct16x16_256_add_neon_pass2(int16_t *src, int16_t *out,
d31s16 = vget_high_s16(q15s16);
// stage 3
- d12s16 = vdup_n_s16(cospi_30_64);
- d13s16 = vdup_n_s16(cospi_2_64);
+ d12s16 = vdup_n_s16((int16_t)cospi_30_64);
+ d13s16 = vdup_n_s16((int16_t)cospi_2_64);
q2s32 = vmull_s16(d16s16, d12s16);
q3s32 = vmull_s16(d17s16, d12s16);
@@ -333,8 +333,8 @@ void vpx_idct16x16_256_add_neon_pass2(int16_t *src, int16_t *out,
q0s16 = vcombine_s16(d0s16, d1s16);
q7s16 = vcombine_s16(d14s16, d15s16);
- d30s16 = vdup_n_s16(cospi_14_64);
- d31s16 = vdup_n_s16(cospi_18_64);
+ d30s16 = vdup_n_s16((int16_t)cospi_14_64);
+ d31s16 = vdup_n_s16((int16_t)cospi_18_64);
q2s32 = vmull_s16(d24s16, d30s16);
q3s32 = vmull_s16(d25s16, d30s16);
@@ -353,8 +353,8 @@ void vpx_idct16x16_256_add_neon_pass2(int16_t *src, int16_t *out,
q1s16 = vcombine_s16(d2s16, d3s16);
q6s16 = vcombine_s16(d12s16, d13s16);
- d30s16 = vdup_n_s16(cospi_22_64);
- d31s16 = vdup_n_s16(cospi_10_64);
+ d30s16 = vdup_n_s16((int16_t)cospi_22_64);
+ d31s16 = vdup_n_s16((int16_t)cospi_10_64);
q11s32 = vmull_s16(d20s16, d30s16);
q12s32 = vmull_s16(d21s16, d30s16);
@@ -373,8 +373,8 @@ void vpx_idct16x16_256_add_neon_pass2(int16_t *src, int16_t *out,
q2s16 = vcombine_s16(d4s16, d5s16);
q5s16 = vcombine_s16(d10s16, d11s16);
- d30s16 = vdup_n_s16(cospi_6_64);
- d31s16 = vdup_n_s16(cospi_26_64);
+ d30s16 = vdup_n_s16((int16_t)cospi_6_64);
+ d31s16 = vdup_n_s16((int16_t)cospi_26_64);
q10s32 = vmull_s16(d28s16, d30s16);
q11s32 = vmull_s16(d29s16, d30s16);
@@ -413,8 +413,8 @@ void vpx_idct16x16_256_add_neon_pass2(int16_t *src, int16_t *out,
d28s16 = vget_low_s16(q14s16);
d29s16 = vget_high_s16(q14s16);
- d30s16 = vdup_n_s16(cospi_8_64);
- d31s16 = vdup_n_s16(cospi_24_64);
+ d30s16 = vdup_n_s16((int16_t)cospi_8_64);
+ d31s16 = vdup_n_s16((int16_t)cospi_24_64);
q2s32 = vmull_s16(d18s16, d31s16);
q3s32 = vmull_s16(d19s16, d31s16);
@@ -474,7 +474,7 @@ void vpx_idct16x16_256_add_neon_pass2(int16_t *src, int16_t *out,
d26s16 = vget_low_s16(q13s16);
d27s16 = vget_high_s16(q13s16);
- d14s16 = vdup_n_s16(cospi_16_64);
+ d14s16 = vdup_n_s16((int16_t)cospi_16_64);
q3s32 = vmull_s16(d26s16, d14s16);
q4s32 = vmull_s16(d27s16, d14s16);
@@ -837,15 +837,15 @@ void vpx_idct16x16_10_add_neon_pass1(int16_t *in, int16_t *out,
&q15s16);
// stage 3
- q0s16 = vdupq_n_s16(cospi_28_64 * 2);
- q1s16 = vdupq_n_s16(cospi_4_64 * 2);
+ q0s16 = vdupq_n_s16((int16_t)cospi_28_64 * 2);
+ q1s16 = vdupq_n_s16((int16_t)cospi_4_64 * 2);
q4s16 = vqrdmulhq_s16(q9s16, q0s16);
q7s16 = vqrdmulhq_s16(q9s16, q1s16);
// stage 4
- q1s16 = vdupq_n_s16(cospi_16_64 * 2);
- d4s16 = vdup_n_s16(cospi_16_64);
+ q1s16 = vdupq_n_s16((int16_t)cospi_16_64 * 2);
+ d4s16 = vdup_n_s16((int16_t)cospi_16_64);
q8s16 = vqrdmulhq_s16(q8s16, q1s16);
@@ -979,13 +979,13 @@ void vpx_idct16x16_10_add_neon_pass2(int16_t *src, int16_t *out,
&q15s16);
// stage 3
- q6s16 = vdupq_n_s16(cospi_30_64 * 2);
+ q6s16 = vdupq_n_s16((int16_t)cospi_30_64 * 2);
q0s16 = vqrdmulhq_s16(q8s16, q6s16);
- q6s16 = vdupq_n_s16(cospi_2_64 * 2);
+ q6s16 = vdupq_n_s16((int16_t)cospi_2_64 * 2);
q7s16 = vqrdmulhq_s16(q8s16, q6s16);
- q15s16 = vdupq_n_s16(-cospi_26_64 * 2);
- q14s16 = vdupq_n_s16(cospi_6_64 * 2);
+ q15s16 = vdupq_n_s16((int16_t)-cospi_26_64 * 2);
+ q14s16 = vdupq_n_s16((int16_t)cospi_6_64 * 2);
q3s16 = vqrdmulhq_s16(q9s16, q15s16);
q4s16 = vqrdmulhq_s16(q9s16, q14s16);
@@ -999,8 +999,8 @@ void vpx_idct16x16_10_add_neon_pass2(int16_t *src, int16_t *out,
d14s16 = vget_low_s16(q7s16);
d15s16 = vget_high_s16(q7s16);
- d30s16 = vdup_n_s16(cospi_8_64);
- d31s16 = vdup_n_s16(cospi_24_64);
+ d30s16 = vdup_n_s16((int16_t)cospi_8_64);
+ d31s16 = vdup_n_s16((int16_t)cospi_24_64);
q12s32 = vmull_s16(d14s16, d31s16);
q5s32 = vmull_s16(d15s16, d31s16);
@@ -1057,7 +1057,7 @@ void vpx_idct16x16_10_add_neon_pass2(int16_t *src, int16_t *out,
d26s16 = vget_low_s16(q13s16);
d27s16 = vget_high_s16(q13s16);
- d14s16 = vdup_n_s16(cospi_16_64);
+ d14s16 = vdup_n_s16((int16_t)cospi_16_64);
q3s32 = vmull_s16(d26s16, d14s16);
q4s32 = vmull_s16(d27s16, d14s16);
q0s32 = vmull_s16(d20s16, d14s16);
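The changes to the C counterpart above add explicit (int16_t) casts on the
cospi constants handed to vdup_n_s16/vdupq_n_s16. The constants are declared
with a wider integral type (tran_high_t) in vpx_dsp/txfm_common.h, so the cast
makes the narrowing to the intrinsic's int16_t lane argument explicit and keeps
implicit-conversion warnings quiet. A minimal sketch of the pattern; the
tran_high_t width below is an assumption made for this sketch only:

/* Sketch of the cast pattern, not the library's actual declarations. */
#include <arm_neon.h>
#include <stdint.h>

typedef int32_t tran_high_t;                  /* assumed width, for illustration */
static const tran_high_t cospi_16_64 = 11585;

static int16x4_t dup_cospi_16_64(void) {
  /* the value fits in 16 bits; the cast only spells the narrowing out */
  return vdup_n_s16((int16_t)cospi_16_64);
}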
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.asm
deleted file mode 100644
index 96d276b4d14..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.asm
+++ /dev/null
@@ -1,144 +0,0 @@
-;
-; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license and patent
-; grant that can be found in the LICENSE file in the root of the source
-; tree. All contributing project authors may be found in the AUTHORS
-; file in the root of the source tree.
-;
-
- EXPORT |vpx_idct32x32_1_add_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-    ;TODO(hkuang): put the following macros in a separate
-    ;file so other idct functions can also use them.
- MACRO
- LD_16x8 $src, $stride
- vld1.8 {q8}, [$src], $stride
- vld1.8 {q9}, [$src], $stride
- vld1.8 {q10}, [$src], $stride
- vld1.8 {q11}, [$src], $stride
- vld1.8 {q12}, [$src], $stride
- vld1.8 {q13}, [$src], $stride
- vld1.8 {q14}, [$src], $stride
- vld1.8 {q15}, [$src], $stride
- MEND
-
- MACRO
- ADD_DIFF_16x8 $diff
- vqadd.u8 q8, q8, $diff
- vqadd.u8 q9, q9, $diff
- vqadd.u8 q10, q10, $diff
- vqadd.u8 q11, q11, $diff
- vqadd.u8 q12, q12, $diff
- vqadd.u8 q13, q13, $diff
- vqadd.u8 q14, q14, $diff
- vqadd.u8 q15, q15, $diff
- MEND
-
- MACRO
- SUB_DIFF_16x8 $diff
- vqsub.u8 q8, q8, $diff
- vqsub.u8 q9, q9, $diff
- vqsub.u8 q10, q10, $diff
- vqsub.u8 q11, q11, $diff
- vqsub.u8 q12, q12, $diff
- vqsub.u8 q13, q13, $diff
- vqsub.u8 q14, q14, $diff
- vqsub.u8 q15, q15, $diff
- MEND
-
- MACRO
- ST_16x8 $dst, $stride
- vst1.8 {q8}, [$dst], $stride
- vst1.8 {q9}, [$dst], $stride
- vst1.8 {q10},[$dst], $stride
- vst1.8 {q11},[$dst], $stride
- vst1.8 {q12},[$dst], $stride
- vst1.8 {q13},[$dst], $stride
- vst1.8 {q14},[$dst], $stride
- vst1.8 {q15},[$dst], $stride
- MEND
-
-;void vpx_idct32x32_1_add_neon(int16_t *input, uint8_t *dest,
-; int dest_stride)
-;
-; r0 int16_t input
-; r1 uint8_t *dest
-; r2 int dest_stride
-
-|vpx_idct32x32_1_add_neon| PROC
- push {lr}
- pld [r1]
- add r3, r1, #16 ; r3 dest + 16 for second loop
- ldrsh r0, [r0]
-
- ; generate cospi_16_64 = 11585
- mov r12, #0x2d00
- add r12, #0x41
-
- ; out = dct_const_round_shift(input[0] * cospi_16_64)
- mul r0, r0, r12 ; input[0] * cospi_16_64
- add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1))
- asr r0, r0, #14 ; >> DCT_CONST_BITS
-
- ; out = dct_const_round_shift(out * cospi_16_64)
- mul r0, r0, r12 ; out * cospi_16_64
- mov r12, r1 ; save dest
- add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1))
- asr r0, r0, #14 ; >> DCT_CONST_BITS
-
- ; a1 = ROUND_POWER_OF_TWO(out, 6)
- add r0, r0, #32 ; + (1 <<((6) - 1))
- asrs r0, r0, #6 ; >> 6
- bge diff_positive_32_32
-
-diff_negative_32_32
- neg r0, r0
- usat r0, #8, r0
- vdup.u8 q0, r0
- mov r0, #4
-
-diff_negative_32_32_loop
- sub r0, #1
- LD_16x8 r1, r2
- SUB_DIFF_16x8 q0
- ST_16x8 r12, r2
-
- LD_16x8 r1, r2
- SUB_DIFF_16x8 q0
- ST_16x8 r12, r2
- cmp r0, #2
- moveq r1, r3
- moveq r12, r3
- cmp r0, #0
- bne diff_negative_32_32_loop
- pop {pc}
-
-diff_positive_32_32
- usat r0, #8, r0
- vdup.u8 q0, r0
- mov r0, #4
-
-diff_positive_32_32_loop
- sub r0, #1
- LD_16x8 r1, r2
- ADD_DIFF_16x8 q0
- ST_16x8 r12, r2
-
- LD_16x8 r1, r2
- ADD_DIFF_16x8 q0
- ST_16x8 r12, r2
- cmp r0, #2
- moveq r1, r3
- moveq r12, r3
- cmp r0, #0
- bne diff_positive_32_32_loop
- pop {pc}
-
- ENDP ; |vpx_idct32x32_1_add_neon|
- END
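The file removed above is the DC-only (single coefficient) 32x32 path; the same
arithmetic lives on in the C version patched just below. The add #0x2000 /
asr #14 pairs are dct_const_round_shift() with DCT_CONST_BITS = 14, and the
add #32 / asr #6 pair is ROUND_POWER_OF_TWO(out, 6). A scalar restatement of
that sequence, for reference only:

/* Scalar restatement of the deleted DC path; reference only, not the patch. */
#include <stdint.h>

#define DCT_CONST_BITS 14
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))

static int32_t dct_const_round_shift(int32_t input) {
  return ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
}

static int16_t idct32x32_dc_value(int16_t input0) {
  const int16_t cospi_16_64 = 11585;
  int32_t out = dct_const_round_shift(input0 * cospi_16_64);
  out = dct_const_round_shift(out * cospi_16_64);
  return (int16_t)ROUND_POWER_OF_TWO(out, 6); /* a1, added to every pixel */
}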
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c
index 9dfdf8d6965..6be4b01229b 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c
@@ -94,7 +94,7 @@ void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest,
uint8x16_t q0u8, q8u8, q9u8, q10u8, q11u8, q12u8, q13u8, q14u8, q15u8;
int i, j, dest_stride8;
uint8_t *d;
- int16_t a1, cospi_16_64 = 11585;
+ int16_t a1;
int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
out = dct_const_round_shift(out * cospi_16_64);
@@ -103,7 +103,7 @@ void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest,
dest_stride8 = dest_stride * 8;
if (a1 >= 0) { // diff_positive_32_32
a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1;
- q0u8 = vdupq_n_u8(a1);
+ q0u8 = vdupq_n_u8((uint8_t)a1);
for (i = 0; i < 2; i++, dest += 16) { // diff_positive_32_32_loop
d = dest;
for (j = 0; j < 4; j++) {
@@ -119,7 +119,7 @@ void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest,
} else { // diff_negative_32_32
a1 = -a1;
a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1;
- q0u8 = vdupq_n_u8(a1);
+ q0u8 = vdupq_n_u8((uint8_t)a1);
for (i = 0; i < 2; i++, dest += 16) { // diff_negative_32_32_loop
d = dest;
for (j = 0; j < 4; j++) {
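After the clamp, a1 is known to lie in [0, 255], so the new (uint8_t) casts for
vdupq_n_u8 are value-preserving. The loop body outside these hunks
saturating-adds that duplicated DC value to eight rows of 16 pixels at a time
through q8u8..q15u8; a compact, hypothetical re-expression of one such strip:

/* Hypothetical compact form of one 16x8 strip of the DC add loop above;
 * the real function unrolls this across q8u8..q15u8. */
#include <arm_neon.h>
#include <stdint.h>

static void add_dc_16x8(uint8_t *d, int stride, uint8x16_t dc) {
  int r;
  for (r = 0; r < 8; ++r) {
    const uint8x16_t p = vld1q_u8(d);
    vst1q_u8(d, vqaddq_u8(p, dc)); /* saturating add, mirrors vqadd.u8 */
    d += stride;
  }
}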
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c
new file mode 100644
index 00000000000..ebec9df54ad
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c
@@ -0,0 +1,519 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/txfm_common.h"
+
+// Only for the first pass of the _34_ variant. Since it only uses values from
+// the top left 8x8 it can safely assume all the remaining values are 0 and skip
+// an awful lot of calculations. In fact, only the first 6 columns make the cut.
+// None of the elements in the 7th or 8th column are used so it skips any calls
+// to input[67] too.
+// In C this does a single row of 32 for each call. Here it transposes the top
+// left 8x8 to allow using SIMD.
+
+// vp9/common/vp9_scan.c:vp9_default_iscan_32x32 arranges the first 34 non-zero
+// coefficients as follows:
+// 0 1 2 3 4 5 6 7
+// 0 0 2 5 10 17 25
+// 1 1 4 8 15 22 30
+// 2 3 7 12 18 28
+// 3 6 11 16 23 31
+// 4 9 14 19 29
+// 5 13 20 26
+// 6 21 27 33
+// 7 24 32
+static void idct32_6_neon(const int16_t *input, int16_t *output) {
+ int16x8_t in0, in1, in2, in3, in4, in5, in6, in7;
+ int16x8_t s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8, s1_9, s1_10,
+ s1_11, s1_12, s1_13, s1_14, s1_15, s1_16, s1_17, s1_18, s1_19, s1_20,
+ s1_21, s1_22, s1_23, s1_24, s1_25, s1_26, s1_27, s1_28, s1_29, s1_30,
+ s1_31;
+ int16x8_t s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8, s2_9, s2_10,
+ s2_11, s2_12, s2_13, s2_14, s2_15, s2_16, s2_17, s2_18, s2_19, s2_20,
+ s2_21, s2_22, s2_23, s2_24, s2_25, s2_26, s2_27, s2_28, s2_29, s2_30,
+ s2_31;
+ int16x8_t s3_24, s3_25, s3_26, s3_27;
+
+ load_and_transpose_s16_8x8(input, 32, &in0, &in1, &in2, &in3, &in4, &in5,
+ &in6, &in7);
+
+ // stage 1
+ // input[1] * cospi_31_64 - input[31] * cospi_1_64 (but input[31] == 0)
+ s1_16 = multiply_shift_and_narrow_s16(in1, cospi_31_64);
+ // input[1] * cospi_1_64 + input[31] * cospi_31_64 (but input[31] == 0)
+ s1_31 = multiply_shift_and_narrow_s16(in1, cospi_1_64);
+
+ s1_20 = multiply_shift_and_narrow_s16(in5, cospi_27_64);
+ s1_27 = multiply_shift_and_narrow_s16(in5, cospi_5_64);
+
+ s1_23 = multiply_shift_and_narrow_s16(in3, -cospi_29_64);
+ s1_24 = multiply_shift_and_narrow_s16(in3, cospi_3_64);
+
+ // stage 2
+ s2_8 = multiply_shift_and_narrow_s16(in2, cospi_30_64);
+ s2_15 = multiply_shift_and_narrow_s16(in2, cospi_2_64);
+
+ // stage 3
+ s1_4 = multiply_shift_and_narrow_s16(in4, cospi_28_64);
+ s1_7 = multiply_shift_and_narrow_s16(in4, cospi_4_64);
+
+ s1_17 = multiply_accumulate_shift_and_narrow_s16(s1_16, -cospi_4_64, s1_31,
+ cospi_28_64);
+ s1_30 = multiply_accumulate_shift_and_narrow_s16(s1_16, cospi_28_64, s1_31,
+ cospi_4_64);
+
+ s1_21 = multiply_accumulate_shift_and_narrow_s16(s1_20, -cospi_20_64, s1_27,
+ cospi_12_64);
+ s1_26 = multiply_accumulate_shift_and_narrow_s16(s1_20, cospi_12_64, s1_27,
+ cospi_20_64);
+
+ s1_22 = multiply_accumulate_shift_and_narrow_s16(s1_23, -cospi_12_64, s1_24,
+ -cospi_20_64);
+ s1_25 = multiply_accumulate_shift_and_narrow_s16(s1_23, -cospi_20_64, s1_24,
+ cospi_12_64);
+
+ // stage 4
+ s1_0 = multiply_shift_and_narrow_s16(in0, cospi_16_64);
+
+ s2_9 = multiply_accumulate_shift_and_narrow_s16(s2_8, -cospi_8_64, s2_15,
+ cospi_24_64);
+ s2_14 = multiply_accumulate_shift_and_narrow_s16(s2_8, cospi_24_64, s2_15,
+ cospi_8_64);
+
+ s2_20 = vsubq_s16(s1_23, s1_20);
+ s2_21 = vsubq_s16(s1_22, s1_21);
+ s2_22 = vaddq_s16(s1_21, s1_22);
+ s2_23 = vaddq_s16(s1_20, s1_23);
+ s2_24 = vaddq_s16(s1_24, s1_27);
+ s2_25 = vaddq_s16(s1_25, s1_26);
+ s2_26 = vsubq_s16(s1_25, s1_26);
+ s2_27 = vsubq_s16(s1_24, s1_27);
+
+ // stage 5
+ s1_5 = sub_multiply_shift_and_narrow_s16(s1_7, s1_4, cospi_16_64);
+ s1_6 = add_multiply_shift_and_narrow_s16(s1_4, s1_7, cospi_16_64);
+
+ s1_18 = multiply_accumulate_shift_and_narrow_s16(s1_17, -cospi_8_64, s1_30,
+ cospi_24_64);
+ s1_29 = multiply_accumulate_shift_and_narrow_s16(s1_17, cospi_24_64, s1_30,
+ cospi_8_64);
+
+ s1_19 = multiply_accumulate_shift_and_narrow_s16(s1_16, -cospi_8_64, s1_31,
+ cospi_24_64);
+ s1_28 = multiply_accumulate_shift_and_narrow_s16(s1_16, cospi_24_64, s1_31,
+ cospi_8_64);
+
+ s1_20 = multiply_accumulate_shift_and_narrow_s16(s2_20, -cospi_24_64, s2_27,
+ -cospi_8_64);
+ s1_27 = multiply_accumulate_shift_and_narrow_s16(s2_20, -cospi_8_64, s2_27,
+ cospi_24_64);
+
+ s1_21 = multiply_accumulate_shift_and_narrow_s16(s2_21, -cospi_24_64, s2_26,
+ -cospi_8_64);
+ s1_26 = multiply_accumulate_shift_and_narrow_s16(s2_21, -cospi_8_64, s2_26,
+ cospi_24_64);
+
+ // stage 6
+ s2_0 = vaddq_s16(s1_0, s1_7);
+ s2_1 = vaddq_s16(s1_0, s1_6);
+ s2_2 = vaddq_s16(s1_0, s1_5);
+ s2_3 = vaddq_s16(s1_0, s1_4);
+ s2_4 = vsubq_s16(s1_0, s1_4);
+ s2_5 = vsubq_s16(s1_0, s1_5);
+ s2_6 = vsubq_s16(s1_0, s1_6);
+ s2_7 = vsubq_s16(s1_0, s1_7);
+
+ s2_10 = sub_multiply_shift_and_narrow_s16(s2_14, s2_9, cospi_16_64);
+ s2_13 = add_multiply_shift_and_narrow_s16(s2_9, s2_14, cospi_16_64);
+
+ s2_11 = sub_multiply_shift_and_narrow_s16(s2_15, s2_8, cospi_16_64);
+ s2_12 = add_multiply_shift_and_narrow_s16(s2_8, s2_15, cospi_16_64);
+
+ s2_16 = vaddq_s16(s1_16, s2_23);
+ s2_17 = vaddq_s16(s1_17, s2_22);
+ s2_18 = vaddq_s16(s1_18, s1_21);
+ s2_19 = vaddq_s16(s1_19, s1_20);
+ s2_20 = vsubq_s16(s1_19, s1_20);
+ s2_21 = vsubq_s16(s1_18, s1_21);
+ s2_22 = vsubq_s16(s1_17, s2_22);
+ s2_23 = vsubq_s16(s1_16, s2_23);
+
+ s3_24 = vsubq_s16(s1_31, s2_24);
+ s3_25 = vsubq_s16(s1_30, s2_25);
+ s3_26 = vsubq_s16(s1_29, s1_26);
+ s3_27 = vsubq_s16(s1_28, s1_27);
+ s2_28 = vaddq_s16(s1_27, s1_28);
+ s2_29 = vaddq_s16(s1_26, s1_29);
+ s2_30 = vaddq_s16(s2_25, s1_30);
+ s2_31 = vaddq_s16(s2_24, s1_31);
+
+ // stage 7
+ s1_0 = vaddq_s16(s2_0, s2_15);
+ s1_1 = vaddq_s16(s2_1, s2_14);
+ s1_2 = vaddq_s16(s2_2, s2_13);
+ s1_3 = vaddq_s16(s2_3, s2_12);
+ s1_4 = vaddq_s16(s2_4, s2_11);
+ s1_5 = vaddq_s16(s2_5, s2_10);
+ s1_6 = vaddq_s16(s2_6, s2_9);
+ s1_7 = vaddq_s16(s2_7, s2_8);
+ s1_8 = vsubq_s16(s2_7, s2_8);
+ s1_9 = vsubq_s16(s2_6, s2_9);
+ s1_10 = vsubq_s16(s2_5, s2_10);
+ s1_11 = vsubq_s16(s2_4, s2_11);
+ s1_12 = vsubq_s16(s2_3, s2_12);
+ s1_13 = vsubq_s16(s2_2, s2_13);
+ s1_14 = vsubq_s16(s2_1, s2_14);
+ s1_15 = vsubq_s16(s2_0, s2_15);
+
+ s1_20 = sub_multiply_shift_and_narrow_s16(s3_27, s2_20, cospi_16_64);
+ s1_27 = add_multiply_shift_and_narrow_s16(s2_20, s3_27, cospi_16_64);
+
+ s1_21 = sub_multiply_shift_and_narrow_s16(s3_26, s2_21, cospi_16_64);
+ s1_26 = add_multiply_shift_and_narrow_s16(s2_21, s3_26, cospi_16_64);
+
+ s1_22 = sub_multiply_shift_and_narrow_s16(s3_25, s2_22, cospi_16_64);
+ s1_25 = add_multiply_shift_and_narrow_s16(s2_22, s3_25, cospi_16_64);
+
+ s1_23 = sub_multiply_shift_and_narrow_s16(s3_24, s2_23, cospi_16_64);
+ s1_24 = add_multiply_shift_and_narrow_s16(s2_23, s3_24, cospi_16_64);
+
+ // final stage
+ vst1q_s16(output, vaddq_s16(s1_0, s2_31));
+ output += 8;
+ vst1q_s16(output, vaddq_s16(s1_1, s2_30));
+ output += 8;
+ vst1q_s16(output, vaddq_s16(s1_2, s2_29));
+ output += 8;
+ vst1q_s16(output, vaddq_s16(s1_3, s2_28));
+ output += 8;
+ vst1q_s16(output, vaddq_s16(s1_4, s1_27));
+ output += 8;
+ vst1q_s16(output, vaddq_s16(s1_5, s1_26));
+ output += 8;
+ vst1q_s16(output, vaddq_s16(s1_6, s1_25));
+ output += 8;
+ vst1q_s16(output, vaddq_s16(s1_7, s1_24));
+ output += 8;
+
+ vst1q_s16(output, vaddq_s16(s1_8, s1_23));
+ output += 8;
+ vst1q_s16(output, vaddq_s16(s1_9, s1_22));
+ output += 8;
+ vst1q_s16(output, vaddq_s16(s1_10, s1_21));
+ output += 8;
+ vst1q_s16(output, vaddq_s16(s1_11, s1_20));
+ output += 8;
+ vst1q_s16(output, vaddq_s16(s1_12, s2_19));
+ output += 8;
+ vst1q_s16(output, vaddq_s16(s1_13, s2_18));
+ output += 8;
+ vst1q_s16(output, vaddq_s16(s1_14, s2_17));
+ output += 8;
+ vst1q_s16(output, vaddq_s16(s1_15, s2_16));
+ output += 8;
+
+ vst1q_s16(output, vsubq_s16(s1_15, s2_16));
+ output += 8;
+ vst1q_s16(output, vsubq_s16(s1_14, s2_17));
+ output += 8;
+ vst1q_s16(output, vsubq_s16(s1_13, s2_18));
+ output += 8;
+ vst1q_s16(output, vsubq_s16(s1_12, s2_19));
+ output += 8;
+ vst1q_s16(output, vsubq_s16(s1_11, s1_20));
+ output += 8;
+ vst1q_s16(output, vsubq_s16(s1_10, s1_21));
+ output += 8;
+ vst1q_s16(output, vsubq_s16(s1_9, s1_22));
+ output += 8;
+ vst1q_s16(output, vsubq_s16(s1_8, s1_23));
+ output += 8;
+
+ vst1q_s16(output, vsubq_s16(s1_7, s1_24));
+ output += 8;
+ vst1q_s16(output, vsubq_s16(s1_6, s1_25));
+ output += 8;
+ vst1q_s16(output, vsubq_s16(s1_5, s1_26));
+ output += 8;
+ vst1q_s16(output, vsubq_s16(s1_4, s1_27));
+ output += 8;
+ vst1q_s16(output, vsubq_s16(s1_3, s2_28));
+ output += 8;
+ vst1q_s16(output, vsubq_s16(s1_2, s2_29));
+ output += 8;
+ vst1q_s16(output, vsubq_s16(s1_1, s2_30));
+ output += 8;
+ vst1q_s16(output, vsubq_s16(s1_0, s2_31));
+}
+
+static void idct32_8_neon(const int16_t *input, uint8_t *output, int stride) {
+ int16x8_t in0, in1, in2, in3, in4, in5, in6, in7;
+ int16x8_t out0, out1, out2, out3, out4, out5, out6, out7;
+ int16x8_t s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8, s1_9, s1_10,
+ s1_11, s1_12, s1_13, s1_14, s1_15, s1_16, s1_17, s1_18, s1_19, s1_20,
+ s1_21, s1_22, s1_23, s1_24, s1_25, s1_26, s1_27, s1_28, s1_29, s1_30,
+ s1_31;
+ int16x8_t s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8, s2_9, s2_10,
+ s2_11, s2_12, s2_13, s2_14, s2_15, s2_16, s2_17, s2_18, s2_19, s2_20,
+ s2_21, s2_22, s2_23, s2_24, s2_25, s2_26, s2_27, s2_28, s2_29, s2_30,
+ s2_31;
+ int16x8_t s3_24, s3_25, s3_26, s3_27;
+
+ load_and_transpose_s16_8x8(input, 8, &in0, &in1, &in2, &in3, &in4, &in5, &in6,
+ &in7);
+
+ // stage 1
+ s1_16 = multiply_shift_and_narrow_s16(in1, cospi_31_64);
+ s1_31 = multiply_shift_and_narrow_s16(in1, cospi_1_64);
+
+ // Different for _8_
+ s1_19 = multiply_shift_and_narrow_s16(in7, -cospi_25_64);
+ s1_28 = multiply_shift_and_narrow_s16(in7, cospi_7_64);
+
+ s1_20 = multiply_shift_and_narrow_s16(in5, cospi_27_64);
+ s1_27 = multiply_shift_and_narrow_s16(in5, cospi_5_64);
+
+ s1_23 = multiply_shift_and_narrow_s16(in3, -cospi_29_64);
+ s1_24 = multiply_shift_and_narrow_s16(in3, cospi_3_64);
+
+ // stage 2
+ s2_8 = multiply_shift_and_narrow_s16(in2, cospi_30_64);
+ s2_15 = multiply_shift_and_narrow_s16(in2, cospi_2_64);
+
+ s2_11 = multiply_shift_and_narrow_s16(in6, -cospi_26_64);
+ s2_12 = multiply_shift_and_narrow_s16(in6, cospi_6_64);
+
+ // stage 3
+ s1_4 = multiply_shift_and_narrow_s16(in4, cospi_28_64);
+ s1_7 = multiply_shift_and_narrow_s16(in4, cospi_4_64);
+
+ s1_17 = multiply_accumulate_shift_and_narrow_s16(s1_16, -cospi_4_64, s1_31,
+ cospi_28_64);
+ s1_30 = multiply_accumulate_shift_and_narrow_s16(s1_16, cospi_28_64, s1_31,
+ cospi_4_64);
+
+ // Different for _8_
+ s1_18 = multiply_accumulate_shift_and_narrow_s16(s1_19, -cospi_28_64, s1_28,
+ -cospi_4_64);
+ s1_29 = multiply_accumulate_shift_and_narrow_s16(s1_19, -cospi_4_64, s1_28,
+ cospi_28_64);
+
+ s1_21 = multiply_accumulate_shift_and_narrow_s16(s1_20, -cospi_20_64, s1_27,
+ cospi_12_64);
+ s1_26 = multiply_accumulate_shift_and_narrow_s16(s1_20, cospi_12_64, s1_27,
+ cospi_20_64);
+
+ s1_22 = multiply_accumulate_shift_and_narrow_s16(s1_23, -cospi_12_64, s1_24,
+ -cospi_20_64);
+ s1_25 = multiply_accumulate_shift_and_narrow_s16(s1_23, -cospi_20_64, s1_24,
+ cospi_12_64);
+
+ // stage 4
+ s1_0 = multiply_shift_and_narrow_s16(in0, cospi_16_64);
+
+ s2_9 = multiply_accumulate_shift_and_narrow_s16(s2_8, -cospi_8_64, s2_15,
+ cospi_24_64);
+ s2_14 = multiply_accumulate_shift_and_narrow_s16(s2_8, cospi_24_64, s2_15,
+ cospi_8_64);
+
+ s2_10 = multiply_accumulate_shift_and_narrow_s16(s2_11, -cospi_24_64, s2_12,
+ -cospi_8_64);
+ s2_13 = multiply_accumulate_shift_and_narrow_s16(s2_11, -cospi_8_64, s2_12,
+ cospi_24_64);
+
+ s2_16 = vaddq_s16(s1_16, s1_19);
+
+ s2_17 = vaddq_s16(s1_17, s1_18);
+ s2_18 = vsubq_s16(s1_17, s1_18);
+
+ s2_19 = vsubq_s16(s1_16, s1_19);
+
+ s2_20 = vsubq_s16(s1_23, s1_20);
+ s2_21 = vsubq_s16(s1_22, s1_21);
+
+ s2_22 = vaddq_s16(s1_21, s1_22);
+ s2_23 = vaddq_s16(s1_20, s1_23);
+
+ s2_24 = vaddq_s16(s1_24, s1_27);
+ s2_25 = vaddq_s16(s1_25, s1_26);
+ s2_26 = vsubq_s16(s1_25, s1_26);
+ s2_27 = vsubq_s16(s1_24, s1_27);
+
+ s2_28 = vsubq_s16(s1_31, s1_28);
+ s2_29 = vsubq_s16(s1_30, s1_29);
+ s2_30 = vaddq_s16(s1_29, s1_30);
+ s2_31 = vaddq_s16(s1_28, s1_31);
+
+ // stage 5
+ s1_5 = sub_multiply_shift_and_narrow_s16(s1_7, s1_4, cospi_16_64);
+ s1_6 = add_multiply_shift_and_narrow_s16(s1_4, s1_7, cospi_16_64);
+
+ s1_8 = vaddq_s16(s2_8, s2_11);
+ s1_9 = vaddq_s16(s2_9, s2_10);
+ s1_10 = vsubq_s16(s2_9, s2_10);
+ s1_11 = vsubq_s16(s2_8, s2_11);
+ s1_12 = vsubq_s16(s2_15, s2_12);
+ s1_13 = vsubq_s16(s2_14, s2_13);
+ s1_14 = vaddq_s16(s2_13, s2_14);
+ s1_15 = vaddq_s16(s2_12, s2_15);
+
+ s1_18 = multiply_accumulate_shift_and_narrow_s16(s2_18, -cospi_8_64, s2_29,
+ cospi_24_64);
+ s1_29 = multiply_accumulate_shift_and_narrow_s16(s2_18, cospi_24_64, s2_29,
+ cospi_8_64);
+
+ s1_19 = multiply_accumulate_shift_and_narrow_s16(s2_19, -cospi_8_64, s2_28,
+ cospi_24_64);
+ s1_28 = multiply_accumulate_shift_and_narrow_s16(s2_19, cospi_24_64, s2_28,
+ cospi_8_64);
+
+ s1_20 = multiply_accumulate_shift_and_narrow_s16(s2_20, -cospi_24_64, s2_27,
+ -cospi_8_64);
+ s1_27 = multiply_accumulate_shift_and_narrow_s16(s2_20, -cospi_8_64, s2_27,
+ cospi_24_64);
+
+ s1_21 = multiply_accumulate_shift_and_narrow_s16(s2_21, -cospi_24_64, s2_26,
+ -cospi_8_64);
+ s1_26 = multiply_accumulate_shift_and_narrow_s16(s2_21, -cospi_8_64, s2_26,
+ cospi_24_64);
+
+ // stage 6
+ s2_0 = vaddq_s16(s1_0, s1_7);
+ s2_1 = vaddq_s16(s1_0, s1_6);
+ s2_2 = vaddq_s16(s1_0, s1_5);
+ s2_3 = vaddq_s16(s1_0, s1_4);
+ s2_4 = vsubq_s16(s1_0, s1_4);
+ s2_5 = vsubq_s16(s1_0, s1_5);
+ s2_6 = vsubq_s16(s1_0, s1_6);
+ s2_7 = vsubq_s16(s1_0, s1_7);
+
+ s2_10 = sub_multiply_shift_and_narrow_s16(s1_13, s1_10, cospi_16_64);
+ s2_13 = add_multiply_shift_and_narrow_s16(s1_10, s1_13, cospi_16_64);
+
+ s2_11 = sub_multiply_shift_and_narrow_s16(s1_12, s1_11, cospi_16_64);
+ s2_12 = add_multiply_shift_and_narrow_s16(s1_11, s1_12, cospi_16_64);
+
+ s1_16 = vaddq_s16(s2_16, s2_23);
+ s1_17 = vaddq_s16(s2_17, s2_22);
+ s2_18 = vaddq_s16(s1_18, s1_21);
+ s2_19 = vaddq_s16(s1_19, s1_20);
+ s2_20 = vsubq_s16(s1_19, s1_20);
+ s2_21 = vsubq_s16(s1_18, s1_21);
+ s1_22 = vsubq_s16(s2_17, s2_22);
+ s1_23 = vsubq_s16(s2_16, s2_23);
+
+ s3_24 = vsubq_s16(s2_31, s2_24);
+ s3_25 = vsubq_s16(s2_30, s2_25);
+ s3_26 = vsubq_s16(s1_29, s1_26);
+ s3_27 = vsubq_s16(s1_28, s1_27);
+ s2_28 = vaddq_s16(s1_27, s1_28);
+ s2_29 = vaddq_s16(s1_26, s1_29);
+ s2_30 = vaddq_s16(s2_25, s2_30);
+ s2_31 = vaddq_s16(s2_24, s2_31);
+
+ // stage 7
+ s1_0 = vaddq_s16(s2_0, s1_15);
+ s1_1 = vaddq_s16(s2_1, s1_14);
+ s1_2 = vaddq_s16(s2_2, s2_13);
+ s1_3 = vaddq_s16(s2_3, s2_12);
+ s1_4 = vaddq_s16(s2_4, s2_11);
+ s1_5 = vaddq_s16(s2_5, s2_10);
+ s1_6 = vaddq_s16(s2_6, s1_9);
+ s1_7 = vaddq_s16(s2_7, s1_8);
+ s1_8 = vsubq_s16(s2_7, s1_8);
+ s1_9 = vsubq_s16(s2_6, s1_9);
+ s1_10 = vsubq_s16(s2_5, s2_10);
+ s1_11 = vsubq_s16(s2_4, s2_11);
+ s1_12 = vsubq_s16(s2_3, s2_12);
+ s1_13 = vsubq_s16(s2_2, s2_13);
+ s1_14 = vsubq_s16(s2_1, s1_14);
+ s1_15 = vsubq_s16(s2_0, s1_15);
+
+ s1_20 = sub_multiply_shift_and_narrow_s16(s3_27, s2_20, cospi_16_64);
+ s1_27 = add_multiply_shift_and_narrow_s16(s2_20, s3_27, cospi_16_64);
+
+ s1_21 = sub_multiply_shift_and_narrow_s16(s3_26, s2_21, cospi_16_64);
+ s1_26 = add_multiply_shift_and_narrow_s16(s2_21, s3_26, cospi_16_64);
+
+ s2_22 = sub_multiply_shift_and_narrow_s16(s3_25, s1_22, cospi_16_64);
+ s1_25 = add_multiply_shift_and_narrow_s16(s1_22, s3_25, cospi_16_64);
+
+ s2_23 = sub_multiply_shift_and_narrow_s16(s3_24, s1_23, cospi_16_64);
+ s1_24 = add_multiply_shift_and_narrow_s16(s1_23, s3_24, cospi_16_64);
+
+ // final stage
+ out0 = vaddq_s16(s1_0, s2_31);
+ out1 = vaddq_s16(s1_1, s2_30);
+ out2 = vaddq_s16(s1_2, s2_29);
+ out3 = vaddq_s16(s1_3, s2_28);
+ out4 = vaddq_s16(s1_4, s1_27);
+ out5 = vaddq_s16(s1_5, s1_26);
+ out6 = vaddq_s16(s1_6, s1_25);
+ out7 = vaddq_s16(s1_7, s1_24);
+
+ add_and_store_u8_s16(out0, out1, out2, out3, out4, out5, out6, out7, output,
+ stride);
+
+ out0 = vaddq_s16(s1_8, s2_23);
+ out1 = vaddq_s16(s1_9, s2_22);
+ out2 = vaddq_s16(s1_10, s1_21);
+ out3 = vaddq_s16(s1_11, s1_20);
+ out4 = vaddq_s16(s1_12, s2_19);
+ out5 = vaddq_s16(s1_13, s2_18);
+ out6 = vaddq_s16(s1_14, s1_17);
+ out7 = vaddq_s16(s1_15, s1_16);
+
+ add_and_store_u8_s16(out0, out1, out2, out3, out4, out5, out6, out7,
+ output + (8 * stride), stride);
+
+ out0 = vsubq_s16(s1_15, s1_16);
+ out1 = vsubq_s16(s1_14, s1_17);
+ out2 = vsubq_s16(s1_13, s2_18);
+ out3 = vsubq_s16(s1_12, s2_19);
+ out4 = vsubq_s16(s1_11, s1_20);
+ out5 = vsubq_s16(s1_10, s1_21);
+ out6 = vsubq_s16(s1_9, s2_22);
+ out7 = vsubq_s16(s1_8, s2_23);
+
+ add_and_store_u8_s16(out0, out1, out2, out3, out4, out5, out6, out7,
+ output + (16 * stride), stride);
+
+ out0 = vsubq_s16(s1_7, s1_24);
+ out1 = vsubq_s16(s1_6, s1_25);
+ out2 = vsubq_s16(s1_5, s1_26);
+ out3 = vsubq_s16(s1_4, s1_27);
+ out4 = vsubq_s16(s1_3, s2_28);
+ out5 = vsubq_s16(s1_2, s2_29);
+ out6 = vsubq_s16(s1_1, s2_30);
+ out7 = vsubq_s16(s1_0, s2_31);
+
+ add_and_store_u8_s16(out0, out1, out2, out3, out4, out5, out6, out7,
+ output + (24 * stride), stride);
+}
+
+void vpx_idct32x32_34_add_neon(const int16_t *input, uint8_t *dest,
+ int stride) {
+ int i;
+ int16_t temp[32 * 8];
+ int16_t *t = temp;
+
+ idct32_6_neon(input, t);
+
+ for (i = 0; i < 32; i += 8) {
+ idct32_8_neon(t, dest, stride);
+ t += (8 * 8);
+ dest += 8;
+ }
+}
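The new file leans on helpers declared in vpx_dsp/arm/idct_neon.h
(load_and_transpose_s16_8x8, multiply_shift_and_narrow_s16 and its
accumulate/add/sub variants, add_and_store_u8_s16) whose bodies are not part of
this hunk. As a rough sketch, multiply_shift_and_narrow_s16 is presumably the
vector form of dct_const_round_shift(a * constant), mirroring the saturating
vqrshrn.s32 #14 sequences in the assembly above; this is an assumption, not the
header's actual code:

/* Assumed shape of multiply_shift_and_narrow_s16(); see idct_neon.h for the
 * real definition. Widen, multiply by the cosine constant, then
 * round-shift-narrow by DCT_CONST_BITS (14). */
#include <arm_neon.h>

static int16x8_t mul_shift_narrow_s16_sketch(const int16x8_t a,
                                             const int16_t a_const) {
  const int32x4_t lo = vmull_n_s16(vget_low_s16(a), a_const);
  const int32x4_t hi = vmull_n_s16(vget_high_s16(a), a_const);
  return vcombine_s16(vqrshrn_n_s32(lo, 14), vqrshrn_n_s32(hi, 14));
}

The driver at the bottom of the file then runs idct32_6_neon once for the row
pass into a 32x8 temporary and idct32_8_neon four times for the column passes,
producing eight output columns per call.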
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_add_neon.asm
deleted file mode 100644
index 7483ee77e18..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_add_neon.asm
+++ /dev/null
@@ -1,1299 +0,0 @@
-;
-; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-;TODO(cd): adjust these constants to be able to use vqdmulh for faster
-; dct_const_round_shift(a * b) within butterfly calculations.
-cospi_1_64 EQU 16364
-cospi_2_64 EQU 16305
-cospi_3_64 EQU 16207
-cospi_4_64 EQU 16069
-cospi_5_64 EQU 15893
-cospi_6_64 EQU 15679
-cospi_7_64 EQU 15426
-cospi_8_64 EQU 15137
-cospi_9_64 EQU 14811
-cospi_10_64 EQU 14449
-cospi_11_64 EQU 14053
-cospi_12_64 EQU 13623
-cospi_13_64 EQU 13160
-cospi_14_64 EQU 12665
-cospi_15_64 EQU 12140
-cospi_16_64 EQU 11585
-cospi_17_64 EQU 11003
-cospi_18_64 EQU 10394
-cospi_19_64 EQU 9760
-cospi_20_64 EQU 9102
-cospi_21_64 EQU 8423
-cospi_22_64 EQU 7723
-cospi_23_64 EQU 7005
-cospi_24_64 EQU 6270
-cospi_25_64 EQU 5520
-cospi_26_64 EQU 4756
-cospi_27_64 EQU 3981
-cospi_28_64 EQU 3196
-cospi_29_64 EQU 2404
-cospi_30_64 EQU 1606
-cospi_31_64 EQU 804
-
-
- EXPORT |vpx_idct32x32_1024_add_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
- AREA Block, CODE, READONLY
-
- ; --------------------------------------------------------------------------
- ; Load from transposed_buffer
- ; q13 = transposed_buffer[first_offset]
- ; q14 = transposed_buffer[second_offset]
- ; for proper address calculation, the last offset used when manipulating
- ; transposed_buffer must be passed in. use 0 for first use.
- MACRO
- LOAD_FROM_TRANSPOSED $prev_offset, $first_offset, $second_offset
- ; address calculation with proper stride and loading
- add r0, #($first_offset - $prev_offset )*8*2
- vld1.s16 {q14}, [r0]
- add r0, #($second_offset - $first_offset)*8*2
- vld1.s16 {q13}, [r0]
- ; (used) two registers (q14, q13)
- MEND
- ; --------------------------------------------------------------------------
- ; Load from output (used as temporary storage)
- ; reg1 = output[first_offset]
- ; reg2 = output[second_offset]
- ; for proper address calculation, the last offset used when manipulating
-    ; output (whether reading or storing) must be passed in. use 0 for first
- ; use.
- MACRO
- LOAD_FROM_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2
- ; address calculation with proper stride and loading
- add r1, #($first_offset - $prev_offset )*32*2
- vld1.s16 {$reg1}, [r1]
- add r1, #($second_offset - $first_offset)*32*2
- vld1.s16 {$reg2}, [r1]
- ; (used) two registers ($reg1, $reg2)
- MEND
- ; --------------------------------------------------------------------------
-    ; Store into output (sometimes as temporary storage)
- ; output[first_offset] = reg1
- ; output[second_offset] = reg2
- ; for proper address calculation, the last offset used when manipulating
-    ; output (whether reading or storing) must be passed in. use 0 for first
- ; use.
- MACRO
- STORE_IN_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2
- ; address calculation with proper stride and storing
- add r1, #($first_offset - $prev_offset )*32*2
- vst1.16 {$reg1}, [r1]
- add r1, #($second_offset - $first_offset)*32*2
- vst1.16 {$reg2}, [r1]
- MEND
- ; --------------------------------------------------------------------------
- ; Combine-add results with current destination content
- ; q6-q9 contain the results (out[j * 32 + 0-31])
- MACRO
- STORE_COMBINE_CENTER_RESULTS
- ; load dest[j * dest_stride + 0-31]
- vld1.s16 {d8}, [r10], r2
- vld1.s16 {d11}, [r9], r11
- vld1.s16 {d9}, [r10]
- vld1.s16 {d10}, [r9]
- ; ROUND_POWER_OF_TWO
- vrshr.s16 q7, q7, #6
- vrshr.s16 q8, q8, #6
- vrshr.s16 q9, q9, #6
- vrshr.s16 q6, q6, #6
- ; add to dest[j * dest_stride + 0-31]
- vaddw.u8 q7, q7, d9
- vaddw.u8 q8, q8, d10
- vaddw.u8 q9, q9, d11
- vaddw.u8 q6, q6, d8
- ; clip pixel
- vqmovun.s16 d9, q7
- vqmovun.s16 d10, q8
- vqmovun.s16 d11, q9
- vqmovun.s16 d8, q6
- ; store back into dest[j * dest_stride + 0-31]
- vst1.16 {d9}, [r10], r11
- vst1.16 {d10}, [r9], r2
- vst1.16 {d8}, [r10]
- vst1.16 {d11}, [r9]
- ; update pointers (by dest_stride * 2)
- sub r9, r9, r2, lsl #1
- add r10, r10, r2, lsl #1
- MEND
- ; --------------------------------------------------------------------------
- ; Combine-add results with current destination content
- ; q6-q9 contain the results (out[j * 32 + 0-31])
- MACRO
- STORE_COMBINE_CENTER_RESULTS_LAST
- ; load dest[j * dest_stride + 0-31]
- vld1.s16 {d8}, [r10], r2
- vld1.s16 {d11}, [r9], r11
- vld1.s16 {d9}, [r10]
- vld1.s16 {d10}, [r9]
- ; ROUND_POWER_OF_TWO
- vrshr.s16 q7, q7, #6
- vrshr.s16 q8, q8, #6
- vrshr.s16 q9, q9, #6
- vrshr.s16 q6, q6, #6
- ; add to dest[j * dest_stride + 0-31]
- vaddw.u8 q7, q7, d9
- vaddw.u8 q8, q8, d10
- vaddw.u8 q9, q9, d11
- vaddw.u8 q6, q6, d8
- ; clip pixel
- vqmovun.s16 d9, q7
- vqmovun.s16 d10, q8
- vqmovun.s16 d11, q9
- vqmovun.s16 d8, q6
- ; store back into dest[j * dest_stride + 0-31]
- vst1.16 {d9}, [r10], r11
- vst1.16 {d10}, [r9], r2
- vst1.16 {d8}, [r10]!
- vst1.16 {d11}, [r9]!
- ; update pointers (by dest_stride * 2)
- sub r9, r9, r2, lsl #1
- add r10, r10, r2, lsl #1
- MEND
- ; --------------------------------------------------------------------------
- ; Combine-add results with current destination content
- ; q4-q7 contain the results (out[j * 32 + 0-31])
- MACRO
- STORE_COMBINE_EXTREME_RESULTS
- ; load dest[j * dest_stride + 0-31]
- vld1.s16 {d4}, [r7], r2
- vld1.s16 {d7}, [r6], r11
- vld1.s16 {d5}, [r7]
- vld1.s16 {d6}, [r6]
- ; ROUND_POWER_OF_TWO
- vrshr.s16 q5, q5, #6
- vrshr.s16 q6, q6, #6
- vrshr.s16 q7, q7, #6
- vrshr.s16 q4, q4, #6
- ; add to dest[j * dest_stride + 0-31]
- vaddw.u8 q5, q5, d5
- vaddw.u8 q6, q6, d6
- vaddw.u8 q7, q7, d7
- vaddw.u8 q4, q4, d4
- ; clip pixel
- vqmovun.s16 d5, q5
- vqmovun.s16 d6, q6
- vqmovun.s16 d7, q7
- vqmovun.s16 d4, q4
- ; store back into dest[j * dest_stride + 0-31]
- vst1.16 {d5}, [r7], r11
- vst1.16 {d6}, [r6], r2
- vst1.16 {d7}, [r6]
- vst1.16 {d4}, [r7]
- ; update pointers (by dest_stride * 2)
- sub r6, r6, r2, lsl #1
- add r7, r7, r2, lsl #1
- MEND
- ; --------------------------------------------------------------------------
- ; Combine-add results with current destination content
- ; q4-q7 contain the results (out[j * 32 + 0-31])
- MACRO
- STORE_COMBINE_EXTREME_RESULTS_LAST
- ; load dest[j * dest_stride + 0-31]
- vld1.s16 {d4}, [r7], r2
- vld1.s16 {d7}, [r6], r11
- vld1.s16 {d5}, [r7]
- vld1.s16 {d6}, [r6]
- ; ROUND_POWER_OF_TWO
- vrshr.s16 q5, q5, #6
- vrshr.s16 q6, q6, #6
- vrshr.s16 q7, q7, #6
- vrshr.s16 q4, q4, #6
- ; add to dest[j * dest_stride + 0-31]
- vaddw.u8 q5, q5, d5
- vaddw.u8 q6, q6, d6
- vaddw.u8 q7, q7, d7
- vaddw.u8 q4, q4, d4
- ; clip pixel
- vqmovun.s16 d5, q5
- vqmovun.s16 d6, q6
- vqmovun.s16 d7, q7
- vqmovun.s16 d4, q4
- ; store back into dest[j * dest_stride + 0-31]
- vst1.16 {d5}, [r7], r11
- vst1.16 {d6}, [r6], r2
- vst1.16 {d7}, [r6]!
- vst1.16 {d4}, [r7]!
- ; update pointers (by dest_stride * 2)
- sub r6, r6, r2, lsl #1
- add r7, r7, r2, lsl #1
- MEND
- ; --------------------------------------------------------------------------
- ; Touches q8-q12, q15 (q13-q14 are preserved)
- ; valid output registers are anything but q8-q11
- MACRO
- DO_BUTTERFLY $regC, $regD, $regA, $regB, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4
- ; TODO(cd): have special case to re-use constants when they are similar for
- ; consecutive butterflies
- ; TODO(cd): have special case when both constants are the same, do the
- ; additions/subtractions before the multiplies.
- ; generate the constants
- ; generate scalar constants
- mov r8, #$first_constant & 0xFF00
- mov r12, #$second_constant & 0xFF00
- add r8, #$first_constant & 0x00FF
- add r12, #$second_constant & 0x00FF
- ; generate vector constants
- vdup.16 d30, r8
- vdup.16 d31, r12
- ; (used) two for inputs (regA-regD), one for constants (q15)
- ; do some multiplications (ordered for maximum latency hiding)
- vmull.s16 q8, $regC, d30
- vmull.s16 q10, $regA, d31
- vmull.s16 q9, $regD, d30
- vmull.s16 q11, $regB, d31
- vmull.s16 q12, $regC, d31
- ; (used) five for intermediate (q8-q12), one for constants (q15)
- ; do some addition/subtractions (to get back two register)
- vsub.s32 q8, q8, q10
- vsub.s32 q9, q9, q11
- ; do more multiplications (ordered for maximum latency hiding)
- vmull.s16 q10, $regD, d31
- vmull.s16 q11, $regA, d30
- vmull.s16 q15, $regB, d30
- ; (used) six for intermediate (q8-q12, q15)
- ; do more addition/subtractions
- vadd.s32 q11, q12, q11
- vadd.s32 q10, q10, q15
- ; (used) four for intermediate (q8-q11)
- ; dct_const_round_shift
- vqrshrn.s32 $reg1, q8, #14
- vqrshrn.s32 $reg2, q9, #14
- vqrshrn.s32 $reg3, q11, #14
- vqrshrn.s32 $reg4, q10, #14
- ; (used) two for results, well four d registers
- MEND
- ; --------------------------------------------------------------------------
- ; Touches q8-q12, q15 (q13-q14 are preserved)
- ; valid output registers are anything but q8-q11
- MACRO
- DO_BUTTERFLY_STD $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4
- DO_BUTTERFLY d28, d29, d26, d27, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4
- MEND
- ; --------------------------------------------------------------------------
-
-;void vpx_idct32x32_1024_add_neon(int16_t *input, uint8_t *dest, int dest_stride);
-;
-; r0 int16_t *input,
-; r1 uint8_t *dest,
-; r2 int dest_stride)
-; loop counters
-; r4 bands loop counter
-; r5 pass loop counter
-; r8 transpose loop counter
-; combine-add pointers
-; r6 dest + 31 * dest_stride, descending (30, 29, 28, ...)
-; r7 dest + 0 * dest_stride, ascending (1, 2, 3, ...)
-; r9 dest + 15 * dest_stride, descending (14, 13, 12, ...)
-; r10 dest + 16 * dest_stride, ascending (17, 18, 19, ...)
-
-|vpx_idct32x32_1024_add_neon| PROC
- ; This function does one pass of idct32x32 transform.
- ;
- ; This is done by transposing the input and then doing a 1d transform on
- ; columns. In the first pass, the transposed columns are the original
-    ; rows. In the second pass, after the transposition, the columns are the
- ; original columns.
- ; The 1d transform is done by looping over bands of eight columns (the
- ; idct32_bands loop). For each band, the transform input transposition
- ; is done on demand, one band of four 8x8 matrices at a time. The four
- ; matrices are transposed by pairs (the idct32_transpose_pair loop).
- push {r4-r11}
- vpush {d8-d15}
- ; stack operation
- ; internal buffer used to transpose 8 lines into before transforming them
- ; int16_t transpose_buffer[32 * 8];
- ; at sp + [4096, 4607]
- ; results of the first pass (transpose and transform rows)
- ; int16_t pass1[32 * 32];
- ; at sp + [0, 2047]
- ; results of the second pass (transpose and transform columns)
- ; int16_t pass2[32 * 32];
- ; at sp + [2048, 4095]
- sub sp, sp, #512+2048+2048
-
- ; r6 = dest + 31 * dest_stride
- ; r7 = dest + 0 * dest_stride
- ; r9 = dest + 15 * dest_stride
- ; r10 = dest + 16 * dest_stride
- rsb r6, r2, r2, lsl #5
- rsb r9, r2, r2, lsl #4
- add r10, r1, r2, lsl #4
- mov r7, r1
- add r6, r6, r1
- add r9, r9, r1
- ; r11 = -dest_stride
- neg r11, r2
- ; r3 = input
- mov r3, r0
- ; parameters for first pass
- ; r0 = transpose_buffer[32 * 8]
- add r0, sp, #4096
- ; r1 = pass1[32 * 32]
- mov r1, sp
-
- mov r5, #0 ; initialize pass loop counter
-idct32_pass_loop
- mov r4, #4 ; initialize bands loop counter
-idct32_bands_loop
- mov r8, #2 ; initialize transpose loop counter
-idct32_transpose_pair_loop
- ; Load two horizontally consecutive 8x8 16bit data matrices. The first one
- ; into q0-q7 and the second one into q8-q15. There is a stride of 64,
- ; adjusted to 32 because of the two post-increments.
- vld1.s16 {q8}, [r3]!
- vld1.s16 {q0}, [r3]!
- add r3, #32
- vld1.s16 {q9}, [r3]!
- vld1.s16 {q1}, [r3]!
- add r3, #32
- vld1.s16 {q10}, [r3]!
- vld1.s16 {q2}, [r3]!
- add r3, #32
- vld1.s16 {q11}, [r3]!
- vld1.s16 {q3}, [r3]!
- add r3, #32
- vld1.s16 {q12}, [r3]!
- vld1.s16 {q4}, [r3]!
- add r3, #32
- vld1.s16 {q13}, [r3]!
- vld1.s16 {q5}, [r3]!
- add r3, #32
- vld1.s16 {q14}, [r3]!
- vld1.s16 {q6}, [r3]!
- add r3, #32
- vld1.s16 {q15}, [r3]!
- vld1.s16 {q7}, [r3]!
-
- ; Transpose the two 8x8 16bit data matrices.
- vswp d17, d24
- vswp d23, d30
- vswp d21, d28
- vswp d19, d26
- vswp d1, d8
- vswp d7, d14
- vswp d5, d12
- vswp d3, d10
- vtrn.32 q8, q10
- vtrn.32 q9, q11
- vtrn.32 q12, q14
- vtrn.32 q13, q15
- vtrn.32 q0, q2
- vtrn.32 q1, q3
- vtrn.32 q4, q6
- vtrn.32 q5, q7
- vtrn.16 q8, q9
- vtrn.16 q10, q11
- vtrn.16 q12, q13
- vtrn.16 q14, q15
- vtrn.16 q0, q1
- vtrn.16 q2, q3
- vtrn.16 q4, q5
- vtrn.16 q6, q7
-
- ; Store both matrices after each other. There is a stride of 32, which
- ; adjusts to nothing because of the post-increments.
- vst1.16 {q8}, [r0]!
- vst1.16 {q9}, [r0]!
- vst1.16 {q10}, [r0]!
- vst1.16 {q11}, [r0]!
- vst1.16 {q12}, [r0]!
- vst1.16 {q13}, [r0]!
- vst1.16 {q14}, [r0]!
- vst1.16 {q15}, [r0]!
- vst1.16 {q0}, [r0]!
- vst1.16 {q1}, [r0]!
- vst1.16 {q2}, [r0]!
- vst1.16 {q3}, [r0]!
- vst1.16 {q4}, [r0]!
- vst1.16 {q5}, [r0]!
- vst1.16 {q6}, [r0]!
- vst1.16 {q7}, [r0]!
-
- ; increment pointers by adjusted stride (not necessary for r0/out)
- ; go back by 7*32 for the seven lines moved fully by read and add
-    ; go back by 32 for the eighth line, which was only read
-    ; advance by 16*2 to go to the next pair
- sub r3, r3, #7*32*2 + 32 - 16*2
- ; transpose pair loop processing
- subs r8, r8, #1
- bne idct32_transpose_pair_loop
-
- ; restore r0/input to its original value
- sub r0, r0, #32*8*2
-
- ; Instead of doing the transforms stage by stage, it is done by loading
- ; some input values and doing as many stages as possible to minimize the
- ; storing/loading of intermediate results. To fit within registers, the
- ; final coefficients are cut into four blocks:
- ; BLOCK A: 16-19,28-31
- ; BLOCK B: 20-23,24-27
- ; BLOCK C: 8-10,11-15
- ; BLOCK D: 0-3,4-7
- ; Blocks A and C are straight calculation through the various stages. In
- ; block B, further calculations are performed using the results from
- ; block A. In block D, further calculations are performed using the results
- ; from block C and then the final calculations are done using results from
- ; block A and B which have been combined at the end of block B.
-
- ; --------------------------------------------------------------------------
- ; BLOCK A: 16-19,28-31
- ; --------------------------------------------------------------------------
- ; generate 16,17,30,31
- ; --------------------------------------------------------------------------
- ; part of stage 1
- ;temp1 = input[1 * 32] * cospi_31_64 - input[31 * 32] * cospi_1_64;
- ;temp2 = input[1 * 32] * cospi_1_64 + input[31 * 32] * cospi_31_64;
- ;step1b[16][i] = dct_const_round_shift(temp1);
- ;step1b[31][i] = dct_const_round_shift(temp2);
- LOAD_FROM_TRANSPOSED 0, 1, 31
- DO_BUTTERFLY_STD cospi_31_64, cospi_1_64, d0, d1, d4, d5
- ; --------------------------------------------------------------------------
- ; part of stage 1
- ;temp1 = input[17 * 32] * cospi_15_64 - input[15 * 32] * cospi_17_64;
- ;temp2 = input[17 * 32] * cospi_17_64 + input[15 * 32] * cospi_15_64;
- ;step1b[17][i] = dct_const_round_shift(temp1);
- ;step1b[30][i] = dct_const_round_shift(temp2);
- LOAD_FROM_TRANSPOSED 31, 17, 15
- DO_BUTTERFLY_STD cospi_15_64, cospi_17_64, d2, d3, d6, d7
- ; --------------------------------------------------------------------------
- ; part of stage 2
- ;step2[16] = step1b[16][i] + step1b[17][i];
- ;step2[17] = step1b[16][i] - step1b[17][i];
- ;step2[30] = -step1b[30][i] + step1b[31][i];
- ;step2[31] = step1b[30][i] + step1b[31][i];
- vadd.s16 q4, q0, q1
- vsub.s16 q13, q0, q1
- vadd.s16 q6, q2, q3
- vsub.s16 q14, q2, q3
- ; --------------------------------------------------------------------------
- ; part of stage 3
- ;temp1 = step1b[30][i] * cospi_28_64 - step1b[17][i] * cospi_4_64;
- ;temp2 = step1b[30][i] * cospi_4_64 - step1b[17][i] * cospi_28_64;
- ;step3[17] = dct_const_round_shift(temp1);
- ;step3[30] = dct_const_round_shift(temp2);
- DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d10, d11, d14, d15
- ; --------------------------------------------------------------------------
- ; generate 18,19,28,29
- ; --------------------------------------------------------------------------
- ; part of stage 1
- ;temp1 = input[9 * 32] * cospi_23_64 - input[23 * 32] * cospi_9_64;
- ;temp2 = input[9 * 32] * cospi_9_64 + input[23 * 32] * cospi_23_64;
- ;step1b[18][i] = dct_const_round_shift(temp1);
- ;step1b[29][i] = dct_const_round_shift(temp2);
- LOAD_FROM_TRANSPOSED 15, 9, 23
- DO_BUTTERFLY_STD cospi_23_64, cospi_9_64, d0, d1, d4, d5
- ; --------------------------------------------------------------------------
- ; part of stage 1
- ;temp1 = input[25 * 32] * cospi_7_64 - input[7 * 32] * cospi_25_64;
- ;temp2 = input[25 * 32] * cospi_25_64 + input[7 * 32] * cospi_7_64;
- ;step1b[19][i] = dct_const_round_shift(temp1);
- ;step1b[28][i] = dct_const_round_shift(temp2);
- LOAD_FROM_TRANSPOSED 23, 25, 7
- DO_BUTTERFLY_STD cospi_7_64, cospi_25_64, d2, d3, d6, d7
- ; --------------------------------------------------------------------------
- ; part of stage 2
- ;step2[18] = -step1b[18][i] + step1b[19][i];
- ;step2[19] = step1b[18][i] + step1b[19][i];
- ;step2[28] = step1b[28][i] + step1b[29][i];
- ;step2[29] = step1b[28][i] - step1b[29][i];
- vsub.s16 q13, q3, q2
- vadd.s16 q3, q3, q2
- vsub.s16 q14, q1, q0
- vadd.s16 q2, q1, q0
- ; --------------------------------------------------------------------------
- ; part of stage 3
- ;temp1 = step1b[18][i] * (-cospi_4_64) - step1b[29][i] * (-cospi_28_64);
- ;temp2 = step1b[18][i] * (-cospi_28_64) + step1b[29][i] * (-cospi_4_64);
- ;step3[29] = dct_const_round_shift(temp1);
- ;step3[18] = dct_const_round_shift(temp2);
- DO_BUTTERFLY_STD (-cospi_4_64), (-cospi_28_64), d2, d3, d0, d1
- ; --------------------------------------------------------------------------
- ; combine 16-19,28-31
- ; --------------------------------------------------------------------------
- ; part of stage 4
- ;step1[16] = step1b[16][i] + step1b[19][i];
- ;step1[17] = step1b[17][i] + step1b[18][i];
- ;step1[18] = step1b[17][i] - step1b[18][i];
- ;step1[29] = step1b[30][i] - step1b[29][i];
- ;step1[30] = step1b[30][i] + step1b[29][i];
- ;step1[31] = step1b[31][i] + step1b[28][i];
- vadd.s16 q8, q4, q2
- vadd.s16 q9, q5, q0
- vadd.s16 q10, q7, q1
- vadd.s16 q15, q6, q3
- vsub.s16 q13, q5, q0
- vsub.s16 q14, q7, q1
- STORE_IN_OUTPUT 0, 16, 31, q8, q15
- STORE_IN_OUTPUT 31, 17, 30, q9, q10
- ; --------------------------------------------------------------------------
- ; part of stage 5
- ;temp1 = step1b[29][i] * cospi_24_64 - step1b[18][i] * cospi_8_64;
- ;temp2 = step1b[29][i] * cospi_8_64 + step1b[18][i] * cospi_24_64;
- ;step2[18] = dct_const_round_shift(temp1);
- ;step2[29] = dct_const_round_shift(temp2);
- DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d0, d1, d2, d3
- STORE_IN_OUTPUT 30, 29, 18, q1, q0
- ; --------------------------------------------------------------------------
- ; part of stage 4
- ;step1[19] = step1b[16][i] - step1b[19][i];
- ;step1[28] = step1b[31][i] - step1b[28][i];
- vsub.s16 q13, q4, q2
- vsub.s16 q14, q6, q3
- ; --------------------------------------------------------------------------
- ; part of stage 5
- ;temp1 = step1b[28][i] * cospi_24_64 - step1b[19][i] * cospi_8_64;
- ;temp2 = step1b[28][i] * cospi_8_64 + step1b[19][i] * cospi_24_64;
- ;step2[19] = dct_const_round_shift(temp1);
- ;step2[28] = dct_const_round_shift(temp2);
- DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d8, d9, d12, d13
- STORE_IN_OUTPUT 18, 19, 28, q4, q6
- ; --------------------------------------------------------------------------
-
-
- ; --------------------------------------------------------------------------
- ; BLOCK B: 20-23,24-27
- ; --------------------------------------------------------------------------
- ; generate 20,21,26,27
- ; --------------------------------------------------------------------------
- ; part of stage 1
- ;temp1 = input[5 * 32] * cospi_27_64 - input[27 * 32] * cospi_5_64;
- ;temp2 = input[5 * 32] * cospi_5_64 + input[27 * 32] * cospi_27_64;
- ;step1b[20][i] = dct_const_round_shift(temp1);
- ;step1b[27][i] = dct_const_round_shift(temp2);
- LOAD_FROM_TRANSPOSED 7, 5, 27
- DO_BUTTERFLY_STD cospi_27_64, cospi_5_64, d0, d1, d4, d5
- ; --------------------------------------------------------------------------
- ; part of stage 1
- ;temp1 = input[21 * 32] * cospi_11_64 - input[11 * 32] * cospi_21_64;
- ;temp2 = input[21 * 32] * cospi_21_64 + input[11 * 32] * cospi_11_64;
- ;step1b[21][i] = dct_const_round_shift(temp1);
- ;step1b[26][i] = dct_const_round_shift(temp2);
- LOAD_FROM_TRANSPOSED 27, 21, 11
- DO_BUTTERFLY_STD cospi_11_64, cospi_21_64, d2, d3, d6, d7
- ; --------------------------------------------------------------------------
- ; part of stage 2
- ;step2[20] = step1b[20][i] + step1b[21][i];
- ;step2[21] = step1b[20][i] - step1b[21][i];
- ;step2[26] = -step1b[26][i] + step1b[27][i];
- ;step2[27] = step1b[26][i] + step1b[27][i];
- vsub.s16 q13, q0, q1
- vadd.s16 q0, q0, q1
- vsub.s16 q14, q2, q3
- vadd.s16 q2, q2, q3
- ; --------------------------------------------------------------------------
- ; part of stage 3
- ;temp1 = step1b[26][i] * cospi_12_64 - step1b[21][i] * cospi_20_64;
- ;temp2 = step1b[26][i] * cospi_20_64 + step1b[21][i] * cospi_12_64;
- ;step3[21] = dct_const_round_shift(temp1);
- ;step3[26] = dct_const_round_shift(temp2);
- DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7
- ; --------------------------------------------------------------------------
- ; generate 22,23,24,25
- ; --------------------------------------------------------------------------
- ; part of stage 1
- ;temp1 = input[13 * 32] * cospi_19_64 - input[19 * 32] * cospi_13_64;
- ;temp2 = input[13 * 32] * cospi_13_64 + input[19 * 32] * cospi_19_64;
- ;step1b[22][i] = dct_const_round_shift(temp1);
- ;step1b[25][i] = dct_const_round_shift(temp2);
- LOAD_FROM_TRANSPOSED 11, 13, 19
- DO_BUTTERFLY_STD cospi_19_64, cospi_13_64, d10, d11, d14, d15
- ; --------------------------------------------------------------------------
- ; part of stage 1
- ;temp1 = input[29 * 32] * cospi_3_64 - input[3 * 32] * cospi_29_64;
- ;temp2 = input[29 * 32] * cospi_29_64 + input[3 * 32] * cospi_3_64;
- ;step1b[23][i] = dct_const_round_shift(temp1);
- ;step1b[24][i] = dct_const_round_shift(temp2);
- LOAD_FROM_TRANSPOSED 19, 29, 3
- DO_BUTTERFLY_STD cospi_3_64, cospi_29_64, d8, d9, d12, d13
- ; --------------------------------------------------------------------------
- ; part of stage 2
- ;step2[22] = -step1b[22][i] + step1b[23][i];
- ;step2[23] = step1b[22][i] + step1b[23][i];
- ;step2[24] = step1b[24][i] + step1b[25][i];
- ;step2[25] = step1b[24][i] - step1b[25][i];
- vsub.s16 q14, q4, q5
- vadd.s16 q5, q4, q5
- vsub.s16 q13, q6, q7
- vadd.s16 q6, q6, q7
- ; --------------------------------------------------------------------------
- ; part of stage 3
- ;temp1 = step1b[22][i] * (-cospi_20_64) - step1b[25][i] * (-cospi_12_64);
- ;temp2 = step1b[22][i] * (-cospi_12_64) + step1b[25][i] * (-cospi_20_64);
- ;step3[25] = dct_const_round_shift(temp1);
- ;step3[22] = dct_const_round_shift(temp2);
- DO_BUTTERFLY_STD (-cospi_20_64), (-cospi_12_64), d8, d9, d14, d15
- ; --------------------------------------------------------------------------
- ; combine 20-23,24-27
- ; --------------------------------------------------------------------------
- ; part of stage 4
- ;step1[22] = step1b[22][i] + step1b[21][i];
- ;step1[23] = step1b[23][i] + step1b[20][i];
- vadd.s16 q10, q7, q1
- vadd.s16 q11, q5, q0
- ;step1[24] = step1b[24][i] + step1b[27][i];
- ;step1[25] = step1b[25][i] + step1b[26][i];
- vadd.s16 q12, q6, q2
- vadd.s16 q15, q4, q3
- ; --------------------------------------------------------------------------
- ; part of stage 6
- ;step3[16] = step1b[16][i] + step1b[23][i];
- ;step3[17] = step1b[17][i] + step1b[22][i];
- ;step3[22] = step1b[17][i] - step1b[22][i];
- ;step3[23] = step1b[16][i] - step1b[23][i];
- LOAD_FROM_OUTPUT 28, 16, 17, q14, q13
- vadd.s16 q8, q14, q11
- vadd.s16 q9, q13, q10
- vsub.s16 q13, q13, q10
- vsub.s16 q11, q14, q11
- STORE_IN_OUTPUT 17, 17, 16, q9, q8
- ; --------------------------------------------------------------------------
- ; part of stage 6
- ;step3[24] = step1b[31][i] - step1b[24][i];
- ;step3[25] = step1b[30][i] - step1b[25][i];
- ;step3[30] = step1b[30][i] + step1b[25][i];
- ;step3[31] = step1b[31][i] + step1b[24][i];
- LOAD_FROM_OUTPUT 16, 30, 31, q14, q9
- vsub.s16 q8, q9, q12
- vadd.s16 q10, q14, q15
- vsub.s16 q14, q14, q15
- vadd.s16 q12, q9, q12
- STORE_IN_OUTPUT 31, 30, 31, q10, q12
- ; --------------------------------------------------------------------------
- ; TODO(cd) do some register allocation change to remove these push/pop
- vpush {q8} ; [24]
- vpush {q11} ; [23]
- ; --------------------------------------------------------------------------
- ; part of stage 7
- ;temp1 = (step1b[25][i] - step1b[22][i]) * cospi_16_64;
- ;temp2 = (step1b[25][i] + step1b[22][i]) * cospi_16_64;
- ;step1[22] = dct_const_round_shift(temp1);
- ;step1[25] = dct_const_round_shift(temp2);
- DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29
- STORE_IN_OUTPUT 31, 25, 22, q14, q13
- ; --------------------------------------------------------------------------
- ; part of stage 7
- ;temp1 = (step1b[24][i] - step1b[23][i]) * cospi_16_64;
- ;temp2 = (step1b[24][i] + step1b[23][i]) * cospi_16_64;
- ;step1[23] = dct_const_round_shift(temp1);
- ;step1[24] = dct_const_round_shift(temp2);
- ; TODO(cd) do some register allocation change to remove these push/pop
- vpop {q13} ; [23]
- vpop {q14} ; [24]
- DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29
- STORE_IN_OUTPUT 22, 24, 23, q14, q13
- ; --------------------------------------------------------------------------
- ; part of stage 4
- ;step1[20] = step1b[23][i] - step1b[20][i];
- ;step1[27] = step1b[24][i] - step1b[27][i];
- vsub.s16 q14, q5, q0
- vsub.s16 q13, q6, q2
- ; --------------------------------------------------------------------------
- ; part of stage 5
- ;temp1 = step1b[20][i] * (-cospi_8_64) - step1b[27][i] * (-cospi_24_64);
- ;temp2 = step1b[20][i] * (-cospi_24_64) + step1b[27][i] * (-cospi_8_64);
- ;step2[27] = dct_const_round_shift(temp1);
- ;step2[20] = dct_const_round_shift(temp2);
- DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d10, d11, d12, d13
- ; --------------------------------------------------------------------------
- ; part of stage 4
- ;step1[21] = step1b[22][i] - step1b[21][i];
- ;step1[26] = step1b[25][i] - step1b[26][i];
- vsub.s16 q14, q7, q1
- vsub.s16 q13, q4, q3
- ; --------------------------------------------------------------------------
- ; part of stage 5
- ;temp1 = step1b[21][i] * (-cospi_8_64) - step1b[26][i] * (-cospi_24_64);
- ;temp2 = step1b[21][i] * (-cospi_24_64) + step1b[26][i] * (-cospi_8_64);
- ;step2[26] = dct_const_round_shift(temp1);
- ;step2[21] = dct_const_round_shift(temp2);
- DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d0, d1, d2, d3
- ; --------------------------------------------------------------------------
- ; part of stage 6
- ;step3[18] = step1b[18][i] + step1b[21][i];
- ;step3[19] = step1b[19][i] + step1b[20][i];
- ;step3[20] = step1b[19][i] - step1b[20][i];
- ;step3[21] = step1b[18][i] - step1b[21][i];
- LOAD_FROM_OUTPUT 23, 18, 19, q14, q13
- vadd.s16 q8, q14, q1
- vadd.s16 q9, q13, q6
- vsub.s16 q13, q13, q6
- vsub.s16 q1, q14, q1
- STORE_IN_OUTPUT 19, 18, 19, q8, q9
- ; --------------------------------------------------------------------------
- ; part of stage 6
- ;step3[27] = step1b[28][i] - step1b[27][i];
- ;step3[28] = step1b[28][i] + step1b[27][i];
- ;step3[29] = step1b[29][i] + step1b[26][i];
- ;step3[26] = step1b[29][i] - step1b[26][i];
- LOAD_FROM_OUTPUT 19, 28, 29, q8, q9
- vsub.s16 q14, q8, q5
- vadd.s16 q10, q8, q5
- vadd.s16 q11, q9, q0
- vsub.s16 q0, q9, q0
- STORE_IN_OUTPUT 29, 28, 29, q10, q11
- ; --------------------------------------------------------------------------
- ; part of stage 7
- ;temp1 = (step1b[27][i] - step1b[20][i]) * cospi_16_64;
- ;temp2 = (step1b[27][i] + step1b[20][i]) * cospi_16_64;
- ;step1[20] = dct_const_round_shift(temp1);
- ;step1[27] = dct_const_round_shift(temp2);
- DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29
- STORE_IN_OUTPUT 29, 20, 27, q13, q14
- ; --------------------------------------------------------------------------
- ; part of stage 7
- ;temp1 = (step1b[26][i] - step1b[21][i]) * cospi_16_64;
- ;temp2 = (step1b[26][i] + step1b[21][i]) * cospi_16_64;
- ;step1[21] = dct_const_round_shift(temp1);
- ;step1[26] = dct_const_round_shift(temp2);
- DO_BUTTERFLY d0, d1, d2, d3, cospi_16_64, cospi_16_64, d2, d3, d0, d1
- STORE_IN_OUTPUT 27, 21, 26, q1, q0
- ; --------------------------------------------------------------------------
-
-
- ; --------------------------------------------------------------------------
- ; BLOCK C: 8-10,11-15
- ; --------------------------------------------------------------------------
- ; generate 8,9,14,15
- ; --------------------------------------------------------------------------
- ; part of stage 2
- ;temp1 = input[2 * 32] * cospi_30_64 - input[30 * 32] * cospi_2_64;
- ;temp2 = input[2 * 32] * cospi_2_64 + input[30 * 32] * cospi_30_64;
- ;step2[8] = dct_const_round_shift(temp1);
- ;step2[15] = dct_const_round_shift(temp2);
- LOAD_FROM_TRANSPOSED 3, 2, 30
- DO_BUTTERFLY_STD cospi_30_64, cospi_2_64, d0, d1, d4, d5
- ; --------------------------------------------------------------------------
- ; part of stage 2
- ;temp1 = input[18 * 32] * cospi_14_64 - input[14 * 32] * cospi_18_64;
- ;temp2 = input[18 * 32] * cospi_18_64 + input[14 * 32] * cospi_14_64;
- ;step2[9] = dct_const_round_shift(temp1);
- ;step2[14] = dct_const_round_shift(temp2);
- LOAD_FROM_TRANSPOSED 30, 18, 14
- DO_BUTTERFLY_STD cospi_14_64, cospi_18_64, d2, d3, d6, d7
- ; --------------------------------------------------------------------------
- ; part of stage 3
- ;step3[8] = step1b[8][i] + step1b[9][i];
- ;step3[9] = step1b[8][i] - step1b[9][i];
- ;step3[14] = step1b[15][i] - step1b[14][i];
- ;step3[15] = step1b[15][i] + step1b[14][i];
- vsub.s16 q13, q0, q1
- vadd.s16 q0, q0, q1
- vsub.s16 q14, q2, q3
- vadd.s16 q2, q2, q3
- ; --------------------------------------------------------------------------
- ; part of stage 4
- ;temp1 = step1b[14][i] * cospi_24_64 - step1b[9][i] * cospi_8_64;
- ;temp2 = step1b[14][i] * cospi_8_64 + step1b[9][i] * cospi_24_64;
- ;step1[9] = dct_const_round_shift(temp1);
- ;step1[14] = dct_const_round_shift(temp2);
- DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d2, d3, d6, d7
- ; --------------------------------------------------------------------------
- ; generate 10,11,12,13
- ; --------------------------------------------------------------------------
- ; part of stage 2
- ;temp1 = input[10 * 32] * cospi_22_64 - input[22 * 32] * cospi_10_64;
- ;temp2 = input[10 * 32] * cospi_10_64 + input[22 * 32] * cospi_22_64;
- ;step2[10] = dct_const_round_shift(temp1);
- ;step2[13] = dct_const_round_shift(temp2);
- LOAD_FROM_TRANSPOSED 14, 10, 22
- DO_BUTTERFLY_STD cospi_22_64, cospi_10_64, d10, d11, d14, d15
- ; --------------------------------------------------------------------------
- ; part of stage 2
- ;temp1 = input[26 * 32] * cospi_6_64 - input[6 * 32] * cospi_26_64;
- ;temp2 = input[26 * 32] * cospi_26_64 + input[6 * 32] * cospi_6_64;
- ;step2[11] = dct_const_round_shift(temp1);
- ;step2[12] = dct_const_round_shift(temp2);
- LOAD_FROM_TRANSPOSED 22, 26, 6
- DO_BUTTERFLY_STD cospi_6_64, cospi_26_64, d8, d9, d12, d13
- ; --------------------------------------------------------------------------
- ; part of stage 3
- ;step3[10] = step1b[11][i] - step1b[10][i];
- ;step3[11] = step1b[11][i] + step1b[10][i];
- ;step3[12] = step1b[12][i] + step1b[13][i];
- ;step3[13] = step1b[12][i] - step1b[13][i];
- vsub.s16 q14, q4, q5
- vadd.s16 q5, q4, q5
- vsub.s16 q13, q6, q7
- vadd.s16 q6, q6, q7
- ; --------------------------------------------------------------------------
- ; part of stage 4
- ;temp1 = step1b[10][i] * (-cospi_8_64) - step1b[13][i] * (-cospi_24_64);
- ;temp2 = step1b[10][i] * (-cospi_24_64) + step1b[13][i] * (-cospi_8_64);
- ;step1[13] = dct_const_round_shift(temp1);
- ;step1[10] = dct_const_round_shift(temp2);
- DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d8, d9, d14, d15
- ; --------------------------------------------------------------------------
- ; combine 8-10,11-15
- ; --------------------------------------------------------------------------
- ; part of stage 5
- ;step2[8] = step1b[8][i] + step1b[11][i];
- ;step2[9] = step1b[9][i] + step1b[10][i];
- ;step2[10] = step1b[9][i] - step1b[10][i];
- vadd.s16 q8, q0, q5
- vadd.s16 q9, q1, q7
- vsub.s16 q13, q1, q7
- ;step2[13] = step1b[14][i] - step1b[13][i];
- ;step2[14] = step1b[14][i] + step1b[13][i];
- ;step2[15] = step1b[15][i] + step1b[12][i];
- vsub.s16 q14, q3, q4
- vadd.s16 q10, q3, q4
- vadd.s16 q15, q2, q6
- STORE_IN_OUTPUT 26, 8, 15, q8, q15
- STORE_IN_OUTPUT 15, 9, 14, q9, q10
- ; --------------------------------------------------------------------------
- ; part of stage 6
- ;temp1 = (step1b[13][i] - step1b[10][i]) * cospi_16_64;
- ;temp2 = (step1b[13][i] + step1b[10][i]) * cospi_16_64;
- ;step3[10] = dct_const_round_shift(temp1);
- ;step3[13] = dct_const_round_shift(temp2);
- DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7
- STORE_IN_OUTPUT 14, 13, 10, q3, q1
- ; --------------------------------------------------------------------------
- ; part of stage 5
- ;step2[11] = step1b[8][i] - step1b[11][i];
- ;step2[12] = step1b[15][i] - step1b[12][i];
- vsub.s16 q13, q0, q5
- vsub.s16 q14, q2, q6
- ; --------------------------------------------------------------------------
- ; part of stage 6
- ;temp1 = (step1b[12][i] - step1b[11][i]) * cospi_16_64;
- ;temp2 = (step1b[12][i] + step1b[11][i]) * cospi_16_64;
- ;step3[11] = dct_const_round_shift(temp1);
- ;step3[12] = dct_const_round_shift(temp2);
- DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7
- STORE_IN_OUTPUT 10, 11, 12, q1, q3
- ; --------------------------------------------------------------------------
-
-
- ; --------------------------------------------------------------------------
- ; BLOCK D: 0-3,4-7
- ; --------------------------------------------------------------------------
- ; generate 4,5,6,7
- ; --------------------------------------------------------------------------
- ; part of stage 3
- ;temp1 = input[4 * 32] * cospi_28_64 - input[28 * 32] * cospi_4_64;
- ;temp2 = input[4 * 32] * cospi_4_64 + input[28 * 32] * cospi_28_64;
- ;step3[4] = dct_const_round_shift(temp1);
- ;step3[7] = dct_const_round_shift(temp2);
- LOAD_FROM_TRANSPOSED 6, 4, 28
- DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d0, d1, d4, d5
- ; --------------------------------------------------------------------------
- ; part of stage 3
- ;temp1 = input[20 * 32] * cospi_12_64 - input[12 * 32] * cospi_20_64;
- ;temp2 = input[20 * 32] * cospi_20_64 + input[12 * 32] * cospi_12_64;
- ;step3[5] = dct_const_round_shift(temp1);
- ;step3[6] = dct_const_round_shift(temp2);
- LOAD_FROM_TRANSPOSED 28, 20, 12
- DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7
- ; --------------------------------------------------------------------------
- ; part of stage 4
- ;step1[4] = step1b[4][i] + step1b[5][i];
- ;step1[5] = step1b[4][i] - step1b[5][i];
- ;step1[6] = step1b[7][i] - step1b[6][i];
- ;step1[7] = step1b[7][i] + step1b[6][i];
- vsub.s16 q13, q0, q1
- vadd.s16 q0, q0, q1
- vsub.s16 q14, q2, q3
- vadd.s16 q2, q2, q3
- ; --------------------------------------------------------------------------
- ; part of stage 5
- ;temp1 = (step1b[6][i] - step1b[5][i]) * cospi_16_64;
- ;temp2 = (step1b[5][i] + step1b[6][i]) * cospi_16_64;
- ;step2[5] = dct_const_round_shift(temp1);
- ;step2[6] = dct_const_round_shift(temp2);
- DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7
- ; --------------------------------------------------------------------------
- ; generate 0,1,2,3
- ; --------------------------------------------------------------------------
- ; part of stage 4
- ;temp1 = (input[0 * 32] - input[16 * 32]) * cospi_16_64;
- ;temp2 = (input[0 * 32] + input[16 * 32]) * cospi_16_64;
- ;step1[1] = dct_const_round_shift(temp1);
- ;step1[0] = dct_const_round_shift(temp2);
- LOAD_FROM_TRANSPOSED 12, 0, 16
- DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d10, d11, d14, d15
- ; --------------------------------------------------------------------------
- ; part of stage 4
- ;temp1 = input[8 * 32] * cospi_24_64 - input[24 * 32] * cospi_8_64;
- ;temp2 = input[8 * 32] * cospi_8_64 + input[24 * 32] * cospi_24_64;
- ;step1[2] = dct_const_round_shift(temp1);
- ;step1[3] = dct_const_round_shift(temp2);
- LOAD_FROM_TRANSPOSED 16, 8, 24
- DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d28, d29, d12, d13
- ; --------------------------------------------------------------------------
- ; part of stage 5
- ;step2[0] = step1b[0][i] + step1b[3][i];
- ;step2[1] = step1b[1][i] + step1b[2][i];
- ;step2[2] = step1b[1][i] - step1b[2][i];
- ;step2[3] = step1b[0][i] - step1b[3][i];
- vadd.s16 q4, q7, q6
- vsub.s16 q7, q7, q6
- vsub.s16 q6, q5, q14
- vadd.s16 q5, q5, q14
- ; --------------------------------------------------------------------------
- ; combine 0-3,4-7
- ; --------------------------------------------------------------------------
- ; part of stage 6
- ;step3[0] = step1b[0][i] + step1b[7][i];
- ;step3[1] = step1b[1][i] + step1b[6][i];
- ;step3[2] = step1b[2][i] + step1b[5][i];
- ;step3[3] = step1b[3][i] + step1b[4][i];
- vadd.s16 q8, q4, q2
- vadd.s16 q9, q5, q3
- vadd.s16 q10, q6, q1
- vadd.s16 q11, q7, q0
- ;step3[4] = step1b[3][i] - step1b[4][i];
- ;step3[5] = step1b[2][i] - step1b[5][i];
- ;step3[6] = step1b[1][i] - step1b[6][i];
- ;step3[7] = step1b[0][i] - step1b[7][i];
- vsub.s16 q12, q7, q0
- vsub.s16 q13, q6, q1
- vsub.s16 q14, q5, q3
- vsub.s16 q15, q4, q2
- ; --------------------------------------------------------------------------
- ; part of stage 7
- ;step1[0] = step1b[0][i] + step1b[15][i];
- ;step1[1] = step1b[1][i] + step1b[14][i];
- ;step1[14] = step1b[1][i] - step1b[14][i];
- ;step1[15] = step1b[0][i] - step1b[15][i];
- LOAD_FROM_OUTPUT 12, 14, 15, q0, q1
- vadd.s16 q2, q8, q1
- vadd.s16 q3, q9, q0
- vsub.s16 q4, q9, q0
- vsub.s16 q5, q8, q1
- ; --------------------------------------------------------------------------
- ; part of final stage
- ;output[14 * 32] = step1b[14][i] + step1b[17][i];
- ;output[15 * 32] = step1b[15][i] + step1b[16][i];
- ;output[16 * 32] = step1b[15][i] - step1b[16][i];
- ;output[17 * 32] = step1b[14][i] - step1b[17][i];
- LOAD_FROM_OUTPUT 15, 16, 17, q0, q1
- vadd.s16 q8, q4, q1
- vadd.s16 q9, q5, q0
- vsub.s16 q6, q5, q0
- vsub.s16 q7, q4, q1
-
- cmp r5, #0
- bgt idct32_bands_end_2nd_pass
-
-idct32_bands_end_1st_pass
- STORE_IN_OUTPUT 17, 16, 17, q6, q7
- STORE_IN_OUTPUT 17, 14, 15, q8, q9
- ; --------------------------------------------------------------------------
- ; part of final stage
- ;output[ 0 * 32] = step1b[0][i] + step1b[31][i];
- ;output[ 1 * 32] = step1b[1][i] + step1b[30][i];
- ;output[30 * 32] = step1b[1][i] - step1b[30][i];
- ;output[31 * 32] = step1b[0][i] - step1b[31][i];
- LOAD_FROM_OUTPUT 15, 30, 31, q0, q1
- vadd.s16 q4, q2, q1
- vadd.s16 q5, q3, q0
- vsub.s16 q6, q3, q0
- vsub.s16 q7, q2, q1
- STORE_IN_OUTPUT 31, 30, 31, q6, q7
- STORE_IN_OUTPUT 31, 0, 1, q4, q5
- ; --------------------------------------------------------------------------
- ; part of stage 7
- ;step1[2] = step1b[2][i] + step1b[13][i];
- ;step1[3] = step1b[3][i] + step1b[12][i];
- ;step1[12] = step1b[3][i] - step1b[12][i];
- ;step1[13] = step1b[2][i] - step1b[13][i];
- LOAD_FROM_OUTPUT 1, 12, 13, q0, q1
- vadd.s16 q2, q10, q1
- vadd.s16 q3, q11, q0
- vsub.s16 q4, q11, q0
- vsub.s16 q5, q10, q1
- ; --------------------------------------------------------------------------
- ; part of final stage
- ;output[12 * 32] = step1b[12][i] + step1b[19][i];
- ;output[13 * 32] = step1b[13][i] + step1b[18][i];
- ;output[18 * 32] = step1b[13][i] - step1b[18][i];
- ;output[19 * 32] = step1b[12][i] - step1b[19][i];
- LOAD_FROM_OUTPUT 13, 18, 19, q0, q1
- vadd.s16 q8, q4, q1
- vadd.s16 q9, q5, q0
- vsub.s16 q6, q5, q0
- vsub.s16 q7, q4, q1
- STORE_IN_OUTPUT 19, 18, 19, q6, q7
- STORE_IN_OUTPUT 19, 12, 13, q8, q9
- ; --------------------------------------------------------------------------
- ; part of final stage
- ;output[ 2 * 32] = step1b[2][i] + step1b[29][i];
- ;output[ 3 * 32] = step1b[3][i] + step1b[28][i];
- ;output[28 * 32] = step1b[3][i] - step1b[28][i];
- ;output[29 * 32] = step1b[2][i] - step1b[29][i];
- LOAD_FROM_OUTPUT 13, 28, 29, q0, q1
- vadd.s16 q4, q2, q1
- vadd.s16 q5, q3, q0
- vsub.s16 q6, q3, q0
- vsub.s16 q7, q2, q1
- STORE_IN_OUTPUT 29, 28, 29, q6, q7
- STORE_IN_OUTPUT 29, 2, 3, q4, q5
- ; --------------------------------------------------------------------------
- ; part of stage 7
- ;step1[4] = step1b[4][i] + step1b[11][i];
- ;step1[5] = step1b[5][i] + step1b[10][i];
- ;step1[10] = step1b[5][i] - step1b[10][i];
- ;step1[11] = step1b[4][i] - step1b[11][i];
- LOAD_FROM_OUTPUT 3, 10, 11, q0, q1
- vadd.s16 q2, q12, q1
- vadd.s16 q3, q13, q0
- vsub.s16 q4, q13, q0
- vsub.s16 q5, q12, q1
- ; --------------------------------------------------------------------------
- ; part of final stage
- ;output[10 * 32] = step1b[10][i] + step1b[21][i];
- ;output[11 * 32] = step1b[11][i] + step1b[20][i];
- ;output[20 * 32] = step1b[11][i] - step1b[20][i];
- ;output[21 * 32] = step1b[10][i] - step1b[21][i];
- LOAD_FROM_OUTPUT 11, 20, 21, q0, q1
- vadd.s16 q8, q4, q1
- vadd.s16 q9, q5, q0
- vsub.s16 q6, q5, q0
- vsub.s16 q7, q4, q1
- STORE_IN_OUTPUT 21, 20, 21, q6, q7
- STORE_IN_OUTPUT 21, 10, 11, q8, q9
- ; --------------------------------------------------------------------------
- ; part of final stage
- ;output[ 4 * 32] = step1b[4][i] + step1b[27][i];
- ;output[ 5 * 32] = step1b[5][i] + step1b[26][i];
- ;output[26 * 32] = step1b[5][i] - step1b[26][i];
- ;output[27 * 32] = step1b[4][i] - step1b[27][i];
- LOAD_FROM_OUTPUT 11, 26, 27, q0, q1
- vadd.s16 q4, q2, q1
- vadd.s16 q5, q3, q0
- vsub.s16 q6, q3, q0
- vsub.s16 q7, q2, q1
- STORE_IN_OUTPUT 27, 26, 27, q6, q7
- STORE_IN_OUTPUT 27, 4, 5, q4, q5
- ; --------------------------------------------------------------------------
- ; part of stage 7
- ;step1[6] = step1b[6][i] + step1b[9][i];
- ;step1[7] = step1b[7][i] + step1b[8][i];
- ;step1[8] = step1b[7][i] - step1b[8][i];
- ;step1[9] = step1b[6][i] - step1b[9][i];
- LOAD_FROM_OUTPUT 5, 8, 9, q0, q1
- vadd.s16 q2, q14, q1
- vadd.s16 q3, q15, q0
- vsub.s16 q4, q15, q0
- vsub.s16 q5, q14, q1
- ; --------------------------------------------------------------------------
- ; part of final stage
- ;output[ 8 * 32] = step1b[8][i] + step1b[23][i];
- ;output[ 9 * 32] = step1b[9][i] + step1b[22][i];
- ;output[22 * 32] = step1b[9][i] - step1b[22][i];
- ;output[23 * 32] = step1b[8][i] - step1b[23][i];
- LOAD_FROM_OUTPUT 9, 22, 23, q0, q1
- vadd.s16 q8, q4, q1
- vadd.s16 q9, q5, q0
- vsub.s16 q6, q5, q0
- vsub.s16 q7, q4, q1
- STORE_IN_OUTPUT 23, 22, 23, q6, q7
- STORE_IN_OUTPUT 23, 8, 9, q8, q9
- ; --------------------------------------------------------------------------
- ; part of final stage
- ;output[ 6 * 32] = step1b[6][i] + step1b[25][i];
- ;output[ 7 * 32] = step1b[7][i] + step1b[24][i];
- ;output[24 * 32] = step1b[7][i] - step1b[24][i];
- ;output[25 * 32] = step1b[6][i] - step1b[25][i];
- LOAD_FROM_OUTPUT 9, 24, 25, q0, q1
- vadd.s16 q4, q2, q1
- vadd.s16 q5, q3, q0
- vsub.s16 q6, q3, q0
- vsub.s16 q7, q2, q1
- STORE_IN_OUTPUT 25, 24, 25, q6, q7
- STORE_IN_OUTPUT 25, 6, 7, q4, q5
-
- ; restore r0 by removing the last offset from the last
- ; operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2
- sub r0, r0, #24*8*2
- ; restore r1 by removing the last offset from the last
- ; operation (STORE_IN_OUTPUT 24, 6, 7) => 7*32*2
- ; advance by 8 columns => 8*2
- sub r1, r1, #7*32*2 - 8*2
- ; advance by 8 lines (8*32*2)
- ; go back by the two pairs from the loop (32*2)
- add r3, r3, #8*32*2 - 32*2
-
- ; bands loop processing
- subs r4, r4, #1
- bne idct32_bands_loop
-
- ; parameters for second pass
- ; the input of pass2 is the result of pass1. we have to remove the offset
- ; of 32 columns induced by the above idct32_bands_loop
- sub r3, r1, #32*2
- ; r1 = pass2[32 * 32]
- add r1, sp, #2048
-
- ; pass loop processing
- add r5, r5, #1
- b idct32_pass_loop
-
-idct32_bands_end_2nd_pass
- STORE_COMBINE_CENTER_RESULTS
- ; --------------------------------------------------------------------------
- ; part of final stage
- ;output[ 0 * 32] = step1b[0][i] + step1b[31][i];
- ;output[ 1 * 32] = step1b[1][i] + step1b[30][i];
- ;output[30 * 32] = step1b[1][i] - step1b[30][i];
- ;output[31 * 32] = step1b[0][i] - step1b[31][i];
- LOAD_FROM_OUTPUT 17, 30, 31, q0, q1
- vadd.s16 q4, q2, q1
- vadd.s16 q5, q3, q0
- vsub.s16 q6, q3, q0
- vsub.s16 q7, q2, q1
- STORE_COMBINE_EXTREME_RESULTS
- ; --------------------------------------------------------------------------
- ; part of stage 7
- ;step1[2] = step1b[2][i] + step1b[13][i];
- ;step1[3] = step1b[3][i] + step1b[12][i];
- ;step1[12] = step1b[3][i] - step1b[12][i];
- ;step1[13] = step1b[2][i] - step1b[13][i];
- LOAD_FROM_OUTPUT 31, 12, 13, q0, q1
- vadd.s16 q2, q10, q1
- vadd.s16 q3, q11, q0
- vsub.s16 q4, q11, q0
- vsub.s16 q5, q10, q1
- ; --------------------------------------------------------------------------
- ; part of final stage
- ;output[12 * 32] = step1b[12][i] + step1b[19][i];
- ;output[13 * 32] = step1b[13][i] + step1b[18][i];
- ;output[18 * 32] = step1b[13][i] - step1b[18][i];
- ;output[19 * 32] = step1b[12][i] - step1b[19][i];
- LOAD_FROM_OUTPUT 13, 18, 19, q0, q1
- vadd.s16 q8, q4, q1
- vadd.s16 q9, q5, q0
- vsub.s16 q6, q5, q0
- vsub.s16 q7, q4, q1
- STORE_COMBINE_CENTER_RESULTS
- ; --------------------------------------------------------------------------
- ; part of final stage
- ;output[ 2 * 32] = step1b[2][i] + step1b[29][i];
- ;output[ 3 * 32] = step1b[3][i] + step1b[28][i];
- ;output[28 * 32] = step1b[3][i] - step1b[28][i];
- ;output[29 * 32] = step1b[2][i] - step1b[29][i];
- LOAD_FROM_OUTPUT 19, 28, 29, q0, q1
- vadd.s16 q4, q2, q1
- vadd.s16 q5, q3, q0
- vsub.s16 q6, q3, q0
- vsub.s16 q7, q2, q1
- STORE_COMBINE_EXTREME_RESULTS
- ; --------------------------------------------------------------------------
- ; part of stage 7
- ;step1[4] = step1b[4][i] + step1b[11][i];
- ;step1[5] = step1b[5][i] + step1b[10][i];
- ;step1[10] = step1b[5][i] - step1b[10][i];
- ;step1[11] = step1b[4][i] - step1b[11][i];
- LOAD_FROM_OUTPUT 29, 10, 11, q0, q1
- vadd.s16 q2, q12, q1
- vadd.s16 q3, q13, q0
- vsub.s16 q4, q13, q0
- vsub.s16 q5, q12, q1
- ; --------------------------------------------------------------------------
- ; part of final stage
- ;output[10 * 32] = step1b[10][i] + step1b[21][i];
- ;output[11 * 32] = step1b[11][i] + step1b[20][i];
- ;output[20 * 32] = step1b[11][i] - step1b[20][i];
- ;output[21 * 32] = step1b[10][i] - step1b[21][i];
- LOAD_FROM_OUTPUT 11, 20, 21, q0, q1
- vadd.s16 q8, q4, q1
- vadd.s16 q9, q5, q0
- vsub.s16 q6, q5, q0
- vsub.s16 q7, q4, q1
- STORE_COMBINE_CENTER_RESULTS
- ; --------------------------------------------------------------------------
- ; part of final stage
- ;output[ 4 * 32] = step1b[4][i] + step1b[27][i];
- ;output[ 5 * 32] = step1b[5][i] + step1b[26][i];
- ;output[26 * 32] = step1b[5][i] - step1b[26][i];
- ;output[27 * 32] = step1b[4][i] - step1b[27][i];
- LOAD_FROM_OUTPUT 21, 26, 27, q0, q1
- vadd.s16 q4, q2, q1
- vadd.s16 q5, q3, q0
- vsub.s16 q6, q3, q0
- vsub.s16 q7, q2, q1
- STORE_COMBINE_EXTREME_RESULTS
- ; --------------------------------------------------------------------------
- ; part of stage 7
- ;step1[6] = step1b[6][i] + step1b[9][i];
- ;step1[7] = step1b[7][i] + step1b[8][i];
- ;step1[8] = step1b[7][i] - step1b[8][i];
- ;step1[9] = step1b[6][i] - step1b[9][i];
- LOAD_FROM_OUTPUT 27, 8, 9, q0, q1
- vadd.s16 q2, q14, q1
- vadd.s16 q3, q15, q0
- vsub.s16 q4, q15, q0
- vsub.s16 q5, q14, q1
- ; --------------------------------------------------------------------------
- ; part of final stage
- ;output[ 8 * 32] = step1b[8][i] + step1b[23][i];
- ;output[ 9 * 32] = step1b[9][i] + step1b[22][i];
- ;output[22 * 32] = step1b[9][i] - step1b[22][i];
- ;output[23 * 32] = step1b[8][i] - step1b[23][i];
- LOAD_FROM_OUTPUT 9, 22, 23, q0, q1
- vadd.s16 q8, q4, q1
- vadd.s16 q9, q5, q0
- vsub.s16 q6, q5, q0
- vsub.s16 q7, q4, q1
- STORE_COMBINE_CENTER_RESULTS_LAST
- ; --------------------------------------------------------------------------
- ; part of final stage
- ;output[ 6 * 32] = step1b[6][i] + step1b[25][i];
- ;output[ 7 * 32] = step1b[7][i] + step1b[24][i];
- ;output[24 * 32] = step1b[7][i] - step1b[24][i];
- ;output[25 * 32] = step1b[6][i] - step1b[25][i];
- LOAD_FROM_OUTPUT 23, 24, 25, q0, q1
- vadd.s16 q4, q2, q1
- vadd.s16 q5, q3, q0
- vsub.s16 q6, q3, q0
- vsub.s16 q7, q2, q1
- STORE_COMBINE_EXTREME_RESULTS_LAST
- ; --------------------------------------------------------------------------
- ; restore pointers to their initial indices for next band pass by
- ; removing/adding dest_stride * 8. The actual increment by eight
- ; is taken care of within the _LAST macros.
- add r6, r6, r2, lsl #3
- add r9, r9, r2, lsl #3
- sub r7, r7, r2, lsl #3
- sub r10, r10, r2, lsl #3
-
- ; restore r0 by removing the last offset from the last
- ; operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2
- sub r0, r0, #24*8*2
- ; restore r1 by removing the last offset from the last
- ; operation (LOAD_FROM_OUTPUT 23, 24, 25) => 25*32*2
- ; advance by 8 columns => 8*2
- sub r1, r1, #25*32*2 - 8*2
- ; advance by 8 lines (8*32*2)
- ; go back by the two pairs from the loop (32*2)
- add r3, r3, #8*32*2 - 32*2
-
- ; bands loop processing
- subs r4, r4, #1
- bne idct32_bands_loop
-
- ; stack operation
- add sp, sp, #512+2048+2048
- vpop {d8-d15}
- pop {r4-r11}
- bx lr
- ENDP ; |vpx_idct32x32_1024_add_neon|
- END
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.asm
index adab715dde5..cbfab361af8 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.asm
@@ -25,9 +25,8 @@
|vpx_idct4x4_1_add_neon| PROC
ldrsh r0, [r0]
- ; generate cospi_16_64 = 11585
- mov r12, #0x2d00
- add r12, #0x41
+ ; cospi_16_64 = 11585
+ movw r12, #0x2d41
; out = dct_const_round_shift(input[0] * cospi_16_64)
mul r0, r0, r12 ; input[0] * cospi_16_64
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c
index b37cb51a1a7..525aac05a84 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c
@@ -21,7 +21,7 @@ void vpx_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest,
uint16x8_t q8u16;
int16x8_t q0s16;
uint8_t *d1, *d2;
- int16_t i, a1, cospi_16_64 = 11585;
+ int16_t i, a1;
int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
out = dct_const_round_shift(out * cospi_16_64);
a1 = ROUND_POWER_OF_TWO(out, 4);
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_add_neon.asm
index 877fbd63435..bd4e86ded25 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_add_neon.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_add_neon.asm
@@ -15,6 +15,8 @@
AREA ||.text||, CODE, READONLY, ALIGN=2
+ INCLUDE vpx_dsp/arm/idct_neon.asm.S
+
AREA Block, CODE, READONLY ; name this block of code
;void vpx_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
;
@@ -33,18 +35,15 @@
; So, two passes of a transpose followed by a column transform.
; load the inputs into q8-q9, d16-d19
- vld1.s16 {q8,q9}, [r0]!
+ LOAD_TRAN_LOW_TO_S16 d16, d17, d18, d19, r0
; generate scalar constants
- ; cospi_8_64 = 15137 = 0x3b21
- mov r0, #0x3b00
- add r0, #0x21
- ; cospi_16_64 = 11585 = 0x2d41
- mov r3, #0x2d00
- add r3, #0x41
- ; cospi_24_64 = 6270 = 0x 187e
- mov r12, #0x1800
- add r12, #0x7e
+ ; cospi_8_64 = 15137
+ movw r0, #0x3b21
+ ; cospi_16_64 = 11585
+ movw r3, #0x2d41
+ ; cospi_24_64 = 6270
+ movw r12, #0x187e
; transpose the input data
; 00 01 02 03 d16
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_add_neon.c
index 1caa456987d..8f669c90765 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_add_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_add_neon.c
@@ -11,6 +11,8 @@
#include <arm_neon.h>
#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/txfm_common.h"
void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest,
int dest_stride) {
@@ -24,14 +26,11 @@ void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest,
int16x4x2_t d0x2s16, d1x2s16;
int32x4x2_t q0x2s32;
uint8_t *d;
- int16_t cospi_8_64 = 15137;
- int16_t cospi_16_64 = 11585;
- int16_t cospi_24_64 = 6270;
d26u32 = d27u32 = vdup_n_u32(0);
- q8s16 = vld1q_s16(input);
- q9s16 = vld1q_s16(input + 8);
+ q8s16 = load_tran_low_to_s16(input);
+ q9s16 = load_tran_low_to_s16(input + 8);
d16s16 = vget_low_s16(q8s16);
d17s16 = vget_high_s16(q8s16);
@@ -43,8 +42,8 @@ void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest,
q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]);
q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]);
- d20s16 = vdup_n_s16(cospi_8_64);
- d21s16 = vdup_n_s16(cospi_16_64);
+ d20s16 = vdup_n_s16((int16_t)cospi_8_64);
+ d21s16 = vdup_n_s16((int16_t)cospi_16_64);
q0x2s32 =
vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q9s16));
@@ -53,7 +52,7 @@ void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest,
d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
- d22s16 = vdup_n_s16(cospi_24_64);
+ d22s16 = vdup_n_s16((int16_t)cospi_24_64);
// stage 1
d23s16 = vadd_s16(d16s16, d18s16);
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.asm
index dbbff364f37..e4531c6e97f 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.asm
@@ -25,9 +25,8 @@
|vpx_idct8x8_1_add_neon| PROC
ldrsh r0, [r0]
- ; generate cospi_16_64 = 11585
- mov r12, #0x2d00
- add r12, #0x41
+ ; cospi_16_64 = 11585
+ movw r12, #0x2d41
; out = dct_const_round_shift(input[0] * cospi_16_64)
mul r0, r0, r12 ; input[0] * cospi_16_64
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c
index df557de8187..eee41e6c6b1 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c
@@ -21,7 +21,7 @@ void vpx_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest,
uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16;
int16x8_t q0s16;
uint8_t *d1, *d2;
- int16_t i, a1, cospi_16_64 = 11585;
+ int16_t i, a1;
int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
out = dct_const_round_shift(out * cospi_16_64);
a1 = ROUND_POWER_OF_TWO(out, 5);
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_add_neon.asm
index 6ab59b41b74..a5c9c927d67 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_add_neon.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_add_neon.asm
@@ -16,6 +16,8 @@
AREA ||.text||, CODE, READONLY, ALIGN=2
+ INCLUDE vpx_dsp/arm/idct_neon.asm.S
+
; Parallel 1D IDCT on all the columns of a 8x8 16bit data matrix which are
; loaded in q8-q15. The output will be stored back into q8-q15 registers.
; This macro will touch q0-q7 registers and use them as buffer during
@@ -207,41 +209,34 @@
|vpx_idct8x8_64_add_neon| PROC
push {r4-r9}
vpush {d8-d15}
- vld1.s16 {q8,q9}, [r0]!
- vld1.s16 {q10,q11}, [r0]!
- vld1.s16 {q12,q13}, [r0]!
- vld1.s16 {q14,q15}, [r0]!
+ LOAD_TRAN_LOW_TO_S16 d16, d17, d18, d19, r0
+ LOAD_TRAN_LOW_TO_S16 d20, d21, d22, d23, r0
+ LOAD_TRAN_LOW_TO_S16 d24, d25, d26, d27, r0
+ LOAD_TRAN_LOW_TO_S16 d28, d29, d30, d31, r0
; transpose the input data
TRANSPOSE8X8
- ; generate cospi_28_64 = 3196
- mov r3, #0x0c00
- add r3, #0x7c
+ ; cospi_28_64 = 3196
+ movw r3, #0x0c7c
- ; generate cospi_4_64 = 16069
- mov r4, #0x3e00
- add r4, #0xc5
+ ; cospi_4_64 = 16069
+ movw r4, #0x3ec5
- ; generate cospi_12_64 = 13623
- mov r5, #0x3500
- add r5, #0x37
+ ; cospi_12_64 = 13623
+ movw r5, #0x3537
- ; generate cospi_20_64 = 9102
- mov r6, #0x2300
- add r6, #0x8e
+ ; cospi_20_64 = 9102
+ movw r6, #0x238e
- ; generate cospi_16_64 = 11585
- mov r7, #0x2d00
- add r7, #0x41
+ ; cospi_16_64 = 11585
+ movw r7, #0x2d41
- ; generate cospi_24_64 = 6270
- mov r8, #0x1800
- add r8, #0x7e
+ ; cospi_24_64 = 6270
+ movw r8, #0x187e
- ; generate cospi_8_64 = 15137
- mov r9, #0x3b00
- add r9, #0x21
+ ; cospi_8_64 = 15137
+ movw r9, #0x3b21
; First transform rows
IDCT8x8_1D
@@ -319,41 +314,34 @@
|vpx_idct8x8_12_add_neon| PROC
push {r4-r9}
vpush {d8-d15}
- vld1.s16 {q8,q9}, [r0]!
- vld1.s16 {q10,q11}, [r0]!
- vld1.s16 {q12,q13}, [r0]!
- vld1.s16 {q14,q15}, [r0]!
+ LOAD_TRAN_LOW_TO_S16 d16, d17, d18, d19, r0
+ LOAD_TRAN_LOW_TO_S16 d20, d21, d22, d23, r0
+ LOAD_TRAN_LOW_TO_S16 d24, d25, d26, d27, r0
+ LOAD_TRAN_LOW_TO_S16 d28, d29, d30, d31, r0
; transpose the input data
TRANSPOSE8X8
- ; generate cospi_28_64 = 3196
- mov r3, #0x0c00
- add r3, #0x7c
+ ; cospi_28_64 = 3196
+ movw r3, #0x0c7c
- ; generate cospi_4_64 = 16069
- mov r4, #0x3e00
- add r4, #0xc5
+ ; cospi_4_64 = 16069
+ movw r4, #0x3ec5
- ; generate cospi_12_64 = 13623
- mov r5, #0x3500
- add r5, #0x37
+ ; cospi_12_64 = 13623
+ movw r5, #0x3537
- ; generate cospi_20_64 = 9102
- mov r6, #0x2300
- add r6, #0x8e
+ ; cospi_20_64 = 9102
+ movw r6, #0x238e
- ; generate cospi_16_64 = 11585
- mov r7, #0x2d00
- add r7, #0x41
+ ; cospi_16_64 = 11585
+ movw r7, #0x2d41
- ; generate cospi_24_64 = 6270
- mov r8, #0x1800
- add r8, #0x7e
+ ; cospi_24_64 = 6270
+ movw r8, #0x187e
- ; generate cospi_8_64 = 15137
- mov r9, #0x3b00
- add r9, #0x21
+ ; cospi_8_64 = 15137
+ movw r9, #0x3b21
; First transform rows
; stage 1
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_add_neon.c
index 7d65612417c..159a6ec9891 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_add_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_add_neon.c
@@ -12,6 +12,7 @@
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/idct_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_dsp/txfm_common.h"
@@ -27,10 +28,10 @@ static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16,
int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32;
int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;
- d0s16 = vdup_n_s16(cospi_28_64);
- d1s16 = vdup_n_s16(cospi_4_64);
- d2s16 = vdup_n_s16(cospi_12_64);
- d3s16 = vdup_n_s16(cospi_20_64);
+ d0s16 = vdup_n_s16((int16_t)cospi_28_64);
+ d1s16 = vdup_n_s16((int16_t)cospi_4_64);
+ d2s16 = vdup_n_s16((int16_t)cospi_12_64);
+ d3s16 = vdup_n_s16((int16_t)cospi_20_64);
d16s16 = vget_low_s16(*q8s16);
d17s16 = vget_high_s16(*q8s16);
@@ -83,7 +84,7 @@ static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16,
q6s16 = vcombine_s16(d12s16, d13s16);
q7s16 = vcombine_s16(d14s16, d15s16);
- d0s16 = vdup_n_s16(cospi_16_64);
+ d0s16 = vdup_n_s16((int16_t)cospi_16_64);
q2s32 = vmull_s16(d16s16, d0s16);
q3s32 = vmull_s16(d17s16, d0s16);
@@ -95,8 +96,8 @@ static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16,
q13s32 = vmlsl_s16(q13s32, d24s16, d0s16);
q15s32 = vmlsl_s16(q15s32, d25s16, d0s16);
- d0s16 = vdup_n_s16(cospi_24_64);
- d1s16 = vdup_n_s16(cospi_8_64);
+ d0s16 = vdup_n_s16((int16_t)cospi_24_64);
+ d1s16 = vdup_n_s16((int16_t)cospi_8_64);
d18s16 = vqrshrn_n_s32(q2s32, 14);
d19s16 = vqrshrn_n_s32(q3s32, 14);
@@ -136,7 +137,7 @@ static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16,
d28s16 = vget_low_s16(*q14s16);
d29s16 = vget_high_s16(*q14s16);
- d16s16 = vdup_n_s16(cospi_16_64);
+ d16s16 = vdup_n_s16((int16_t)cospi_16_64);
q9s32 = vmull_s16(d28s16, d16s16);
q10s32 = vmull_s16(d29s16, d16s16);
@@ -173,14 +174,14 @@ void vpx_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest,
int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
uint16x8_t q8u16, q9u16, q10u16, q11u16;
- q8s16 = vld1q_s16(input);
- q9s16 = vld1q_s16(input + 8);
- q10s16 = vld1q_s16(input + 16);
- q11s16 = vld1q_s16(input + 24);
- q12s16 = vld1q_s16(input + 32);
- q13s16 = vld1q_s16(input + 40);
- q14s16 = vld1q_s16(input + 48);
- q15s16 = vld1q_s16(input + 56);
+ q8s16 = load_tran_low_to_s16(input);
+ q9s16 = load_tran_low_to_s16(input + 8);
+ q10s16 = load_tran_low_to_s16(input + 16);
+ q11s16 = load_tran_low_to_s16(input + 24);
+ q12s16 = load_tran_low_to_s16(input + 32);
+ q13s16 = load_tran_low_to_s16(input + 40);
+ q14s16 = load_tran_low_to_s16(input + 48);
+ q15s16 = load_tran_low_to_s16(input + 56);
transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
&q15s16);
@@ -279,43 +280,43 @@ void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest,
uint16x8_t q8u16, q9u16, q10u16, q11u16;
int32x4_t q9s32, q10s32, q11s32, q12s32;
- q8s16 = vld1q_s16(input);
- q9s16 = vld1q_s16(input + 8);
- q10s16 = vld1q_s16(input + 16);
- q11s16 = vld1q_s16(input + 24);
- q12s16 = vld1q_s16(input + 32);
- q13s16 = vld1q_s16(input + 40);
- q14s16 = vld1q_s16(input + 48);
- q15s16 = vld1q_s16(input + 56);
+ q8s16 = load_tran_low_to_s16(input);
+ q9s16 = load_tran_low_to_s16(input + 8);
+ q10s16 = load_tran_low_to_s16(input + 16);
+ q11s16 = load_tran_low_to_s16(input + 24);
+ q12s16 = load_tran_low_to_s16(input + 32);
+ q13s16 = load_tran_low_to_s16(input + 40);
+ q14s16 = load_tran_low_to_s16(input + 48);
+ q15s16 = load_tran_low_to_s16(input + 56);
transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
&q15s16);
// First transform rows
// stage 1
- q0s16 = vdupq_n_s16(cospi_28_64 * 2);
- q1s16 = vdupq_n_s16(cospi_4_64 * 2);
+ q0s16 = vdupq_n_s16((int16_t)cospi_28_64 * 2);
+ q1s16 = vdupq_n_s16((int16_t)cospi_4_64 * 2);
q4s16 = vqrdmulhq_s16(q9s16, q0s16);
- q0s16 = vdupq_n_s16(-cospi_20_64 * 2);
+ q0s16 = vdupq_n_s16(-(int16_t)cospi_20_64 * 2);
q7s16 = vqrdmulhq_s16(q9s16, q1s16);
- q1s16 = vdupq_n_s16(cospi_12_64 * 2);
+ q1s16 = vdupq_n_s16((int16_t)cospi_12_64 * 2);
q5s16 = vqrdmulhq_s16(q11s16, q0s16);
- q0s16 = vdupq_n_s16(cospi_16_64 * 2);
+ q0s16 = vdupq_n_s16((int16_t)cospi_16_64 * 2);
q6s16 = vqrdmulhq_s16(q11s16, q1s16);
// stage 2 & stage 3 - even half
- q1s16 = vdupq_n_s16(cospi_24_64 * 2);
+ q1s16 = vdupq_n_s16((int16_t)cospi_24_64 * 2);
q9s16 = vqrdmulhq_s16(q8s16, q0s16);
- q0s16 = vdupq_n_s16(cospi_8_64 * 2);
+ q0s16 = vdupq_n_s16((int16_t)cospi_8_64 * 2);
q13s16 = vqrdmulhq_s16(q10s16, q1s16);
@@ -337,7 +338,7 @@ void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest,
d28s16 = vget_low_s16(q14s16);
d29s16 = vget_high_s16(q14s16);
- d16s16 = vdup_n_s16(cospi_16_64);
+ d16s16 = vdup_n_s16((int16_t)cospi_16_64);
q9s32 = vmull_s16(d28s16, d16s16);
q10s32 = vmull_s16(d29s16, d16s16);
q11s32 = vmull_s16(d28s16, d16s16);
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.asm
new file mode 100644
index 00000000000..f39e8ddd4b4
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.asm
@@ -0,0 +1,30 @@
+;
+; Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+ INCLUDE ./vpx_config.asm
+
+ ; Helper function used to load tran_low_t into int16, narrowing if
+ ; necessary.
+ ; $dst0..3 are d registers with the pairs assumed to be contiguous in
+ ; non-high-bitdepth builds. q0-q3 are used as temporaries in high-bitdepth.
+ MACRO
+ LOAD_TRAN_LOW_TO_S16 $dst0, $dst1, $dst2, $dst3, $src
+ IF CONFIG_VP9_HIGHBITDEPTH
+ vld1.s32 {q0,q1}, [$src]!
+ vld1.s32 {q2,q3}, [$src]!
+ vmovn.i32 $dst0, q0
+ vmovn.i32 $dst1, q1
+ vmovn.i32 $dst2, q2
+ vmovn.i32 $dst3, q3
+ ELSE
+ vld1.s16 {$dst0-$dst1,$dst2-$dst3}, [$src]!
+ ENDIF
+ MEND
+ END
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.h b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.h
new file mode 100644
index 00000000000..5c2a53c034f
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.h
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_ARM_IDCT_NEON_H_
+#define VPX_DSP_ARM_IDCT_NEON_H_
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+//------------------------------------------------------------------------------
+
+// Helper function used to load tran_low_t into int16, narrowing if necessary.
+static INLINE int16x8_t load_tran_low_to_s16(const tran_low_t *buf) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ const int32x4_t v0 = vld1q_s32(buf);
+ const int32x4_t v1 = vld1q_s32(buf + 4);
+ const int16x4_t s0 = vmovn_s32(v0);
+ const int16x4_t s1 = vmovn_s32(v1);
+ return vcombine_s16(s0, s1);
+#else
+ return vld1q_s16(buf);
+#endif
+}
+
+// Multiply a by a_const. Saturate, shift and narrow by 14.
+static INLINE int16x8_t multiply_shift_and_narrow_s16(const int16x8_t a,
+ const int16_t a_const) {
+ // Shift by 14 + rounding will be within 16 bits for well formed streams.
+ // See WRAPLOW and dct_const_round_shift for details.
+ // This instruction doubles the result and returns the high half, essentially
+ // resulting in a right shift by 15. By multiplying the constant first that
+ // becomes a right shift by 14.
+ // The largest possible value used here is
+ // vpx_dsp/txfm_common.h:cospi_1_64 = 16364 (* 2 = 32728), which falls *just*
+ // within the range of int16_t (+32767 / -32768) even when negated.
+ return vqrdmulhq_n_s16(a, a_const * 2);
+}
+
+// Add a and b, then multiply by ab_const. Shift and narrow by 14.
+static INLINE int16x8_t add_multiply_shift_and_narrow_s16(
+ const int16x8_t a, const int16x8_t b, const int16_t ab_const) {
+ // In both add_ and its pair, sub_, the input for well-formed streams will be
+ // well within 16 bits (input to the idct is the difference between two frames
+ // and will be within -255 to 255, or 9 bits)
+ // However, for inputs over about 25,000 (valid for int16_t, but not for idct
+ // input) this function can not use vaddq_s16.
+ // In order to match existing behavior and intentionally out of range tests,
+ // expand the addition up to 32 bits to prevent truncation.
+ int32x4_t temp_low = vaddl_s16(vget_low_s16(a), vget_low_s16(b));
+ int32x4_t temp_high = vaddl_s16(vget_high_s16(a), vget_high_s16(b));
+ temp_low = vmulq_n_s32(temp_low, ab_const);
+ temp_high = vmulq_n_s32(temp_high, ab_const);
+ return vcombine_s16(vrshrn_n_s32(temp_low, 14), vrshrn_n_s32(temp_high, 14));
+}
+
+// Subtract b from a, then multiply by ab_const. Shift and narrow by 14.
+static INLINE int16x8_t sub_multiply_shift_and_narrow_s16(
+ const int16x8_t a, const int16x8_t b, const int16_t ab_const) {
+ int32x4_t temp_low = vsubl_s16(vget_low_s16(a), vget_low_s16(b));
+ int32x4_t temp_high = vsubl_s16(vget_high_s16(a), vget_high_s16(b));
+ temp_low = vmulq_n_s32(temp_low, ab_const);
+ temp_high = vmulq_n_s32(temp_high, ab_const);
+ return vcombine_s16(vrshrn_n_s32(temp_low, 14), vrshrn_n_s32(temp_high, 14));
+}
+
+// Multiply a by a_const and b by b_const, then accumulate. Shift and narrow by
+// 14.
+static INLINE int16x8_t multiply_accumulate_shift_and_narrow_s16(
+ const int16x8_t a, const int16_t a_const, const int16x8_t b,
+ const int16_t b_const) {
+ int32x4_t temp_low = vmull_n_s16(vget_low_s16(a), a_const);
+ int32x4_t temp_high = vmull_n_s16(vget_high_s16(a), a_const);
+ temp_low = vmlal_n_s16(temp_low, vget_low_s16(b), b_const);
+ temp_high = vmlal_n_s16(temp_high, vget_high_s16(b), b_const);
+ return vcombine_s16(vrshrn_n_s32(temp_low, 14), vrshrn_n_s32(temp_high, 14));
+}
+
+static INLINE void load_and_transpose_s16_8x8(const int16_t *a, int a_stride,
+ int16x8_t *a0, int16x8_t *a1,
+ int16x8_t *a2, int16x8_t *a3,
+ int16x8_t *a4, int16x8_t *a5,
+ int16x8_t *a6, int16x8_t *a7) {
+ *a0 = vld1q_s16(a);
+ a += a_stride;
+ *a1 = vld1q_s16(a);
+ a += a_stride;
+ *a2 = vld1q_s16(a);
+ a += a_stride;
+ *a3 = vld1q_s16(a);
+ a += a_stride;
+ *a4 = vld1q_s16(a);
+ a += a_stride;
+ *a5 = vld1q_s16(a);
+ a += a_stride;
+ *a6 = vld1q_s16(a);
+ a += a_stride;
+ *a7 = vld1q_s16(a);
+
+ transpose_s16_8x8(a0, a1, a2, a3, a4, a5, a6, a7);
+}
+
+// Shift the output down by 6 and add it to the destination buffer.
+static INLINE void add_and_store_u8_s16(const int16x8_t a0, const int16x8_t a1,
+ const int16x8_t a2, const int16x8_t a3,
+ const int16x8_t a4, const int16x8_t a5,
+ const int16x8_t a6, const int16x8_t a7,
+ uint8_t *b, const int b_stride) {
+ uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7;
+ int16x8_t c0, c1, c2, c3, c4, c5, c6, c7;
+ b0 = vld1_u8(b);
+ b += b_stride;
+ b1 = vld1_u8(b);
+ b += b_stride;
+ b2 = vld1_u8(b);
+ b += b_stride;
+ b3 = vld1_u8(b);
+ b += b_stride;
+ b4 = vld1_u8(b);
+ b += b_stride;
+ b5 = vld1_u8(b);
+ b += b_stride;
+ b6 = vld1_u8(b);
+ b += b_stride;
+ b7 = vld1_u8(b);
+ b -= (7 * b_stride);
+
+ // c = b + (a >> 6)
+ c0 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b0)), a0, 6);
+ c1 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b1)), a1, 6);
+ c2 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b2)), a2, 6);
+ c3 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b3)), a3, 6);
+ c4 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b4)), a4, 6);
+ c5 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b5)), a5, 6);
+ c6 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b6)), a6, 6);
+ c7 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b7)), a7, 6);
+
+ b0 = vqmovun_s16(c0);
+ b1 = vqmovun_s16(c1);
+ b2 = vqmovun_s16(c2);
+ b3 = vqmovun_s16(c3);
+ b4 = vqmovun_s16(c4);
+ b5 = vqmovun_s16(c5);
+ b6 = vqmovun_s16(c6);
+ b7 = vqmovun_s16(c7);
+
+ vst1_u8(b, b0);
+ b += b_stride;
+ vst1_u8(b, b1);
+ b += b_stride;
+ vst1_u8(b, b2);
+ b += b_stride;
+ vst1_u8(b, b3);
+ b += b_stride;
+ vst1_u8(b, b4);
+ b += b_stride;
+ vst1_u8(b, b5);
+ b += b_stride;
+ vst1_u8(b, b6);
+ b += b_stride;
+ vst1_u8(b, b7);
+}
+#endif // VPX_DSP_ARM_IDCT_NEON_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/intrapred_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/intrapred_neon.c
index 38e79ed69dd..e150a5302d5 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/intrapred_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/intrapred_neon.c
@@ -17,306 +17,254 @@
//------------------------------------------------------------------------------
// DC 4x4
-// 'do_above' and 'do_left' facilitate branch removal when inlined.
-static INLINE void dc_4x4(uint8_t *dst, ptrdiff_t stride, const uint8_t *above,
- const uint8_t *left, int do_above, int do_left) {
- uint16x4_t sum_top;
- uint16x4_t sum_left;
- uint16x4_t dc0;
-
- if (do_above) {
- const uint8x8_t A = vld1_u8(above); // top row
- const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top
- sum_top = vpadd_u16(p0, p0);
- }
-
- if (do_left) {
- const uint8x8_t L = vld1_u8(left); // left border
- const uint16x4_t p0 = vpaddl_u8(L); // cascading summation of the left
- sum_left = vpadd_u16(p0, p0);
- }
-
- if (do_above && do_left) {
- const uint16x4_t sum = vadd_u16(sum_left, sum_top);
- dc0 = vrshr_n_u16(sum, 3);
- } else if (do_above) {
- dc0 = vrshr_n_u16(sum_top, 2);
- } else if (do_left) {
- dc0 = vrshr_n_u16(sum_left, 2);
- } else {
- dc0 = vdup_n_u16(0x80);
- }
+static INLINE uint16x4_t dc_sum_4(const uint8_t *ref) {
+ const uint8x8_t ref_u8 = vld1_u8(ref);
+ const uint16x4_t p0 = vpaddl_u8(ref_u8);
+ return vpadd_u16(p0, p0);
+}
- {
- const uint8x8_t dc = vdup_lane_u8(vreinterpret_u8_u16(dc0), 0);
- int i;
- for (i = 0; i < 4; ++i) {
- vst1_lane_u32((uint32_t *)(dst + i * stride), vreinterpret_u32_u8(dc), 0);
- }
+static INLINE void dc_store_4x4(uint8_t *dst, ptrdiff_t stride,
+ const uint8x8_t dc) {
+ const uint8x8_t dc_dup = vdup_lane_u8(dc, 0);
+ int i;
+ for (i = 0; i < 4; ++i, dst += stride) {
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(dc_dup), 0);
}
}
void vpx_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
- dc_4x4(dst, stride, above, left, 1, 1);
+ const uint8x8_t a = vld1_u8(above);
+ const uint8x8_t l = vld1_u8(left);
+ const uint16x8_t al = vaddl_u8(a, l);
+ uint16x4_t sum;
+ uint8x8_t dc;
+ sum = vpadd_u16(vget_low_u16(al), vget_low_u16(al));
+ sum = vpadd_u16(sum, sum);
+ dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 3));
+ dc_store_4x4(dst, stride, dc);
}
void vpx_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
+ const uint16x4_t sum = dc_sum_4(left);
+ const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 2));
(void)above;
- dc_4x4(dst, stride, NULL, left, 0, 1);
+ dc_store_4x4(dst, stride, dc);
}
void vpx_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
+ const uint16x4_t sum = dc_sum_4(above);
+ const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 2));
(void)left;
- dc_4x4(dst, stride, above, NULL, 1, 0);
+ dc_store_4x4(dst, stride, dc);
}
void vpx_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
+ const uint8x8_t dc = vdup_n_u8(0x80);
(void)above;
(void)left;
- dc_4x4(dst, stride, NULL, NULL, 0, 0);
+ dc_store_4x4(dst, stride, dc);
}
//------------------------------------------------------------------------------
// DC 8x8
-// 'do_above' and 'do_left' facilitate branch removal when inlined.
-static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride, const uint8_t *above,
- const uint8_t *left, int do_above, int do_left) {
- uint16x8_t sum_top;
- uint16x8_t sum_left;
- uint8x8_t dc0;
-
- if (do_above) {
- const uint8x8_t A = vld1_u8(above); // top row
- const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top
- const uint16x4_t p1 = vpadd_u16(p0, p0);
- const uint16x4_t p2 = vpadd_u16(p1, p1);
- sum_top = vcombine_u16(p2, p2);
- }
-
- if (do_left) {
- const uint8x8_t L = vld1_u8(left); // left border
- const uint16x4_t p0 = vpaddl_u8(L); // cascading summation of the left
- const uint16x4_t p1 = vpadd_u16(p0, p0);
- const uint16x4_t p2 = vpadd_u16(p1, p1);
- sum_left = vcombine_u16(p2, p2);
- }
-
- if (do_above && do_left) {
- const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
- dc0 = vrshrn_n_u16(sum, 4);
- } else if (do_above) {
- dc0 = vrshrn_n_u16(sum_top, 3);
- } else if (do_left) {
- dc0 = vrshrn_n_u16(sum_left, 3);
- } else {
- dc0 = vdup_n_u8(0x80);
- }
+static INLINE uint16x4_t dc_sum_8(const uint8_t *ref) {
+ const uint8x8_t ref_u8 = vld1_u8(ref);
+ uint16x4_t sum = vpaddl_u8(ref_u8);
+ sum = vpadd_u16(sum, sum);
+ return vpadd_u16(sum, sum);
+}
- {
- const uint8x8_t dc = vdup_lane_u8(dc0, 0);
- int i;
- for (i = 0; i < 8; ++i) {
- vst1_u32((uint32_t *)(dst + i * stride), vreinterpret_u32_u8(dc));
- }
+static INLINE void dc_store_8x8(uint8_t *dst, ptrdiff_t stride,
+ const uint8x8_t dc) {
+ const uint8x8_t dc_dup = vdup_lane_u8(dc, 0);
+ int i;
+ for (i = 0; i < 8; ++i, dst += stride) {
+ vst1_u8(dst, dc_dup);
}
}
void vpx_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
- dc_8x8(dst, stride, above, left, 1, 1);
+ const uint8x8_t above_u8 = vld1_u8(above);
+ const uint8x8_t left_u8 = vld1_u8(left);
+ const uint8x16_t above_and_left = vcombine_u8(above_u8, left_u8);
+ const uint16x8_t p0 = vpaddlq_u8(above_and_left);
+ uint16x4_t sum = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
+ uint8x8_t dc;
+ sum = vpadd_u16(sum, sum);
+ sum = vpadd_u16(sum, sum);
+ dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 4));
+ dc_store_8x8(dst, stride, dc);
}
void vpx_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
+ const uint16x4_t sum = dc_sum_8(left);
+ const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 3));
(void)above;
- dc_8x8(dst, stride, NULL, left, 0, 1);
+ dc_store_8x8(dst, stride, dc);
}
void vpx_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
+ const uint16x4_t sum = dc_sum_8(above);
+ const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 3));
(void)left;
- dc_8x8(dst, stride, above, NULL, 1, 0);
+ dc_store_8x8(dst, stride, dc);
}
void vpx_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
+ const uint8x8_t dc = vdup_n_u8(0x80);
(void)above;
(void)left;
- dc_8x8(dst, stride, NULL, NULL, 0, 0);
+ dc_store_8x8(dst, stride, dc);
}
//------------------------------------------------------------------------------
// DC 16x16
-// 'do_above' and 'do_left' facilitate branch removal when inlined.
-static INLINE void dc_16x16(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left,
- int do_above, int do_left) {
- uint16x8_t sum_top;
- uint16x8_t sum_left;
- uint8x8_t dc0;
-
- if (do_above) {
- const uint8x16_t A = vld1q_u8(above); // top row
- const uint16x8_t p0 = vpaddlq_u8(A); // cascading summation of the top
- const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
- const uint16x4_t p2 = vpadd_u16(p1, p1);
- const uint16x4_t p3 = vpadd_u16(p2, p2);
- sum_top = vcombine_u16(p3, p3);
- }
-
- if (do_left) {
- const uint8x16_t L = vld1q_u8(left); // left row
- const uint16x8_t p0 = vpaddlq_u8(L); // cascading summation of the left
- const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
- const uint16x4_t p2 = vpadd_u16(p1, p1);
- const uint16x4_t p3 = vpadd_u16(p2, p2);
- sum_left = vcombine_u16(p3, p3);
- }
-
- if (do_above && do_left) {
- const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
- dc0 = vrshrn_n_u16(sum, 5);
- } else if (do_above) {
- dc0 = vrshrn_n_u16(sum_top, 4);
- } else if (do_left) {
- dc0 = vrshrn_n_u16(sum_left, 4);
- } else {
- dc0 = vdup_n_u8(0x80);
- }
+static INLINE uint16x4_t dc_sum_16(const uint8_t *ref) {
+ const uint8x16_t ref_u8 = vld1q_u8(ref);
+ const uint16x8_t p0 = vpaddlq_u8(ref_u8);
+ uint16x4_t sum = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
+ sum = vpadd_u16(sum, sum);
+ return vpadd_u16(sum, sum);
+}
- {
- const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
- int i;
- for (i = 0; i < 16; ++i) {
- vst1q_u8(dst + i * stride, dc);
- }
+static INLINE void dc_store_16x16(uint8_t *dst, ptrdiff_t stride,
+ const uint8x8_t dc) {
+ const uint8x16_t dc_dup = vdupq_lane_u8(dc, 0);
+ int i;
+ for (i = 0; i < 16; ++i, dst += stride) {
+ vst1q_u8(dst, dc_dup);
}
}
void vpx_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
- dc_16x16(dst, stride, above, left, 1, 1);
+ const uint8x16_t ref0 = vld1q_u8(above);
+ const uint8x16_t ref1 = vld1q_u8(left);
+ const uint16x8_t p0 = vpaddlq_u8(ref0);
+ const uint16x8_t p1 = vpaddlq_u8(ref1);
+ const uint16x8_t p2 = vaddq_u16(p0, p1);
+ uint16x4_t sum = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));
+ uint8x8_t dc;
+ sum = vpadd_u16(sum, sum);
+ sum = vpadd_u16(sum, sum);
+ dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 5));
+ dc_store_16x16(dst, stride, dc);
}
void vpx_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above,
const uint8_t *left) {
+ const uint16x4_t sum = dc_sum_16(left);
+ const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 4));
(void)above;
- dc_16x16(dst, stride, NULL, left, 0, 1);
+ dc_store_16x16(dst, stride, dc);
}
void vpx_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above,
const uint8_t *left) {
+ const uint16x4_t sum = dc_sum_16(above);
+ const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 4));
(void)left;
- dc_16x16(dst, stride, above, NULL, 1, 0);
+ dc_store_16x16(dst, stride, dc);
}
void vpx_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above,
const uint8_t *left) {
+ const uint8x8_t dc = vdup_n_u8(0x80);
(void)above;
(void)left;
- dc_16x16(dst, stride, NULL, NULL, 0, 0);
+ dc_store_16x16(dst, stride, dc);
}
//------------------------------------------------------------------------------
// DC 32x32
-// 'do_above' and 'do_left' facilitate branch removal when inlined.
-static INLINE void dc_32x32(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left,
- int do_above, int do_left) {
- uint16x8_t sum_top;
- uint16x8_t sum_left;
- uint8x8_t dc0;
-
- if (do_above) {
- const uint8x16_t A0 = vld1q_u8(above); // top row
- const uint8x16_t A1 = vld1q_u8(above + 16);
- const uint16x8_t p0 = vpaddlq_u8(A0); // cascading summation of the top
- const uint16x8_t p1 = vpaddlq_u8(A1);
- const uint16x8_t p2 = vaddq_u16(p0, p1);
- const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));
- const uint16x4_t p4 = vpadd_u16(p3, p3);
- const uint16x4_t p5 = vpadd_u16(p4, p4);
- sum_top = vcombine_u16(p5, p5);
- }
-
- if (do_left) {
- const uint8x16_t L0 = vld1q_u8(left); // left row
- const uint8x16_t L1 = vld1q_u8(left + 16);
- const uint16x8_t p0 = vpaddlq_u8(L0); // cascading summation of the left
- const uint16x8_t p1 = vpaddlq_u8(L1);
- const uint16x8_t p2 = vaddq_u16(p0, p1);
- const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));
- const uint16x4_t p4 = vpadd_u16(p3, p3);
- const uint16x4_t p5 = vpadd_u16(p4, p4);
- sum_left = vcombine_u16(p5, p5);
- }
+static INLINE uint16x4_t dc_sum_32(const uint8_t *ref) {
+ const uint8x16x2_t r = vld2q_u8(ref);
+ const uint16x8_t p0 = vpaddlq_u8(r.val[0]);
+ const uint16x8_t p1 = vpaddlq_u8(r.val[1]);
+ const uint16x8_t p2 = vaddq_u16(p0, p1);
+ uint16x4_t sum = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));
+ sum = vpadd_u16(sum, sum);
+ return vpadd_u16(sum, sum);
+}
- if (do_above && do_left) {
- const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
- dc0 = vrshrn_n_u16(sum, 6);
- } else if (do_above) {
- dc0 = vrshrn_n_u16(sum_top, 5);
- } else if (do_left) {
- dc0 = vrshrn_n_u16(sum_left, 5);
- } else {
- dc0 = vdup_n_u8(0x80);
- }
+static INLINE void dc_store_32x32(uint8_t *dst, ptrdiff_t stride,
+ const uint8x8_t dc) {
+ uint8x16x2_t dc_dup;
+ int i;
+ dc_dup.val[0] = dc_dup.val[1] = vdupq_lane_u8(dc, 0);
- {
- const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
- int i;
- for (i = 0; i < 32; ++i) {
- vst1q_u8(dst + i * stride, dc);
- vst1q_u8(dst + i * stride + 16, dc);
- }
+ for (i = 0; i < 32; ++i, dst += stride) {
+ vst2q_u8(dst, dc_dup);
}
}
void vpx_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
- dc_32x32(dst, stride, above, left, 1, 1);
+ const uint8x16x2_t a = vld2q_u8(above);
+ const uint8x16x2_t l = vld2q_u8(left);
+ const uint16x8_t pa0 = vpaddlq_u8(a.val[0]);
+ const uint16x8_t pl0 = vpaddlq_u8(l.val[0]);
+ const uint16x8_t pa1 = vpaddlq_u8(a.val[1]);
+ const uint16x8_t pl1 = vpaddlq_u8(l.val[1]);
+ const uint16x8_t pa = vaddq_u16(pa0, pa1);
+ const uint16x8_t pl = vaddq_u16(pl0, pl1);
+ const uint16x8_t pal = vaddq_u16(pa, pl);
+ uint16x4_t sum = vadd_u16(vget_low_u16(pal), vget_high_u16(pal));
+ uint8x8_t dc;
+ sum = vpadd_u16(sum, sum);
+ sum = vpadd_u16(sum, sum);
+ dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 6));
+ dc_store_32x32(dst, stride, dc);
}
void vpx_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above,
const uint8_t *left) {
+ const uint16x4_t sum = dc_sum_32(left);
+ const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 5));
(void)above;
- dc_32x32(dst, stride, NULL, left, 0, 1);
+ dc_store_32x32(dst, stride, dc);
}
void vpx_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above,
const uint8_t *left) {
+ const uint16x4_t sum = dc_sum_32(above);
+ const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 5));
(void)left;
- dc_32x32(dst, stride, above, NULL, 1, 0);
+ dc_store_32x32(dst, stride, dc);
}
void vpx_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above,
const uint8_t *left) {
+ const uint8x8_t dc = vdup_n_u8(0x80);
(void)above;
(void)left;
- dc_32x32(dst, stride, NULL, NULL, 0, 0);
+ dc_store_32x32(dst, stride, dc);
}
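Taken together, the 16x16 and 32x32 variants implement the usual DC rule: fill the block with the rounded mean of whichever border samples are available. With both borders a 16x16 block averages 32 samples, hence vrshr_n_u16(sum, 5); with one border it is 16 samples and a shift of 4; the 32x32 paths use shifts of 6 and 5 in the same way. A minimal scalar sketch of the both-borders 16x16 case (the function name is hypothetical and the sketch is illustrative, not part of the patch):

static void dc_16x16_scalar(unsigned char *dst, int stride,
                            const unsigned char *above,
                            const unsigned char *left) {
  int r, c, sum = 0;
  unsigned char dc;
  for (r = 0; r < 16; ++r) sum += above[r] + left[r];  /* 32 border samples */
  dc = (unsigned char)((sum + 16) >> 5);               /* rounded mean */
  for (r = 0; r < 16; ++r)
    for (c = 0; c < 16; ++c) dst[r * stride + c] = dc;
}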
// -----------------------------------------------------------------------------
void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
- const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(above)); // top row
- const uint64x1_t A1 = vshr_n_u64(A0, 8);
- const uint64x1_t A2 = vshr_n_u64(A0, 16);
- const uint8x8_t ABCDEFGH = vreinterpret_u8_u64(A0);
+ const uint8x8_t ABCDEFGH = vld1_u8(above);
+ const uint64x1_t A1 = vshr_n_u64(vreinterpret_u64_u8(ABCDEFGH), 8);
+ const uint64x1_t A2 = vshr_n_u64(vreinterpret_u64_u8(ABCDEFGH), 16);
const uint8x8_t BCDEFGH0 = vreinterpret_u8_u64(A1);
const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2);
const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGH00);
@@ -331,485 +279,506 @@ void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0);
vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0);
vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0);
- dst[3 * stride + 3] = above[7];
+ vst1_lane_u8(dst + 3 * stride + 3, ABCDEFGH, 7);
+}
+
+static INLINE void d45_store_8(uint8_t **dst, const ptrdiff_t stride,
+ const uint8x8_t above_right, uint8x8_t *row) {
+ *row = vext_u8(*row, above_right, 1);
+ vst1_u8(*dst, *row);
+ *dst += stride;
}
void vpx_d45_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
- static const uint8_t shuffle1[8] = { 1, 2, 3, 4, 5, 6, 7, 7 };
- static const uint8_t shuffle2[8] = { 2, 3, 4, 5, 6, 7, 7, 7 };
- const uint8x8_t sh_12345677 = vld1_u8(shuffle1);
- const uint8x8_t sh_23456777 = vld1_u8(shuffle2);
- const uint8x8_t A0 = vld1_u8(above); // top row
- const uint8x8_t A1 = vtbl1_u8(A0, sh_12345677);
- const uint8x8_t A2 = vtbl1_u8(A0, sh_23456777);
+ const uint8x8_t A0 = vld1_u8(above);
+ const uint8x8_t above_right = vdup_lane_u8(A0, 7);
+ const uint8x8_t A1 = vext_u8(A0, above_right, 1);
+ const uint8x8_t A2 = vext_u8(A0, above_right, 2);
const uint8x8_t avg1 = vhadd_u8(A0, A2);
uint8x8_t row = vrhadd_u8(avg1, A1);
- int i;
(void)left;
- for (i = 0; i < 7; ++i) {
- vst1_u8(dst + i * stride, row);
- row = vtbl1_u8(row, sh_12345677);
- }
- vst1_u8(dst + i * stride, row);
+
+ vst1_u8(dst, row);
+ dst += stride;
+ d45_store_8(&dst, stride, above_right, &row);
+ d45_store_8(&dst, stride, above_right, &row);
+ d45_store_8(&dst, stride, above_right, &row);
+ d45_store_8(&dst, stride, above_right, &row);
+ d45_store_8(&dst, stride, above_right, &row);
+ d45_store_8(&dst, stride, above_right, &row);
+ vst1_u8(dst, above_right);
+}
+
+static INLINE void d45_store_16(uint8_t **dst, const ptrdiff_t stride,
+ const uint8x16_t above_right, uint8x16_t *row) {
+ *row = vextq_u8(*row, above_right, 1);
+ vst1q_u8(*dst, *row);
+ *dst += stride;
}
void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
- const uint8x16_t A0 = vld1q_u8(above); // top row
- const uint8x16_t above_right = vld1q_dup_u8(above + 15);
+ const uint8x16_t A0 = vld1q_u8(above);
+ const uint8x16_t above_right = vdupq_lane_u8(vget_high_u8(A0), 7);
const uint8x16_t A1 = vextq_u8(A0, above_right, 1);
const uint8x16_t A2 = vextq_u8(A0, above_right, 2);
const uint8x16_t avg1 = vhaddq_u8(A0, A2);
uint8x16_t row = vrhaddq_u8(avg1, A1);
- int i;
(void)left;
- for (i = 0; i < 15; ++i) {
- vst1q_u8(dst + i * stride, row);
- row = vextq_u8(row, above_right, 1);
- }
- vst1q_u8(dst + i * stride, row);
+
+ vst1q_u8(dst, row);
+ dst += stride;
+ d45_store_16(&dst, stride, above_right, &row);
+ d45_store_16(&dst, stride, above_right, &row);
+ d45_store_16(&dst, stride, above_right, &row);
+ d45_store_16(&dst, stride, above_right, &row);
+ d45_store_16(&dst, stride, above_right, &row);
+ d45_store_16(&dst, stride, above_right, &row);
+ d45_store_16(&dst, stride, above_right, &row);
+ d45_store_16(&dst, stride, above_right, &row);
+ d45_store_16(&dst, stride, above_right, &row);
+ d45_store_16(&dst, stride, above_right, &row);
+ d45_store_16(&dst, stride, above_right, &row);
+ d45_store_16(&dst, stride, above_right, &row);
+ d45_store_16(&dst, stride, above_right, &row);
+ d45_store_16(&dst, stride, above_right, &row);
+ vst1q_u8(dst, above_right);
}
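Each D45 row comes from the standard 3-tap smoothing filter applied to the above row, and every following row is the previous one slid left by one pixel with the above-right pixel appended (vdupq_lane_u8 of lane 7 plus vextq_u8 in the code above). A hedged scalar sketch of the filter value the vhaddq_u8 + vrhaddq_u8 pair reproduces, rounding included (avg3 is a hypothetical helper name):

/* 3-tap average used for the first D45 row; the truncating half-add of a and
 * c followed by the rounding half-add with b yields the same +2 bias. */
static unsigned char avg3(unsigned char a, unsigned char b, unsigned char c) {
  return (unsigned char)((a + 2 * b + c + 2) >> 2);
}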
// -----------------------------------------------------------------------------
void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
- const uint8x8_t XABCD_u8 = vld1_u8(above - 1);
- const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8);
- const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32);
+ const uint8x8_t XABCD = vld1_u8(above - 1);
const uint32x2_t zero = vdup_n_u32(0);
const uint32x2_t IJKL = vld1_lane_u32((const uint32_t *)left, zero, 0);
- const uint8x8_t IJKL_u8 = vreinterpret_u8_u32(IJKL);
- const uint64x1_t LKJI____ = vreinterpret_u64_u8(vrev32_u8(IJKL_u8));
- const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC);
- const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8));
- const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16));
- const uint8_t D = vget_lane_u8(XABCD_u8, 4);
- const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6);
- const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC);
- const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8);
- const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_);
+ const uint8x8_t LKJI = vrev64_u8(vreinterpret_u8_u32(IJKL));
+ const uint8x8_t LKJIXABC = vext_u8(LKJI, XABCD, 4);
+ const uint8x8_t KJIXABCD = vext_u8(LKJI, XABCD, 5);
+ const uint8x8_t JIXABCD0 =
+ vreinterpret_u8_u64(vshr_n_u64(vreinterpret_u64_u8(KJIXABCD), 8));
+ const uint8x8_t avg1 = vhadd_u8(JIXABCD0, LKJIXABC);
+ const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABCD);
const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
const uint32x2_t r3 = vreinterpret_u32_u8(avg2);
const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
- vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0);
- vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0);
- vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0);
- vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0);
+ vst1_lane_u32((uint32_t *)dst, r0, 0);
+ dst += stride;
+ vst1_lane_u32((uint32_t *)dst, r1, 0);
+ dst += stride;
+ vst1_lane_u32((uint32_t *)dst, r2, 0);
+ dst += stride;
+ vst1_lane_u32((uint32_t *)dst, r3, 0);
}
+// -----------------------------------------------------------------------------
+
#if !HAVE_NEON_ASM
void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
+ const uint32_t d = *(const uint32_t *)above;
int i;
- uint32x2_t d0u32 = vdup_n_u32(0);
(void)left;
- d0u32 = vld1_lane_u32((const uint32_t *)above, d0u32, 0);
- for (i = 0; i < 4; i++, dst += stride)
- vst1_lane_u32((uint32_t *)dst, d0u32, 0);
+ for (i = 0; i < 4; i++, dst += stride) {
+ *(uint32_t *)dst = d;
+ }
}
void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
+ const uint8x8_t d = vld1_u8(above);
int i;
- uint8x8_t d0u8 = vdup_n_u8(0);
(void)left;
- d0u8 = vld1_u8(above);
- for (i = 0; i < 8; i++, dst += stride) vst1_u8(dst, d0u8);
+ for (i = 0; i < 8; i++, dst += stride) {
+ vst1_u8(dst, d);
+ }
}
void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t d = vld1q_u8(above);
int i;
- uint8x16_t q0u8 = vdupq_n_u8(0);
(void)left;
- q0u8 = vld1q_u8(above);
- for (i = 0; i < 16; i++, dst += stride) vst1q_u8(dst, q0u8);
+ for (i = 0; i < 16; i++, dst += stride) {
+ vst1q_u8(dst, d);
+ }
}
void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t d0 = vld1q_u8(above);
+ const uint8x16_t d1 = vld1q_u8(above + 16);
int i;
- uint8x16_t q0u8 = vdupq_n_u8(0);
- uint8x16_t q1u8 = vdupq_n_u8(0);
(void)left;
- q0u8 = vld1q_u8(above);
- q1u8 = vld1q_u8(above + 16);
- for (i = 0; i < 32; i++, dst += stride) {
- vst1q_u8(dst, q0u8);
- vst1q_u8(dst + 16, q1u8);
+ for (i = 0; i < 32; i++) {
+ // Note: performance was worse using vst2q_u8 under gcc-4.9 & clang-3.8.
+ // clang-3.8 unrolled the loop fully with no filler so the cause is likely
+ // the latency of the instruction.
+ vst1q_u8(dst, d0);
+ dst += 16;
+ vst1q_u8(dst, d1);
+ dst += stride - 16;
}
}
+// -----------------------------------------------------------------------------
+
void vpx_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
- uint8x8_t d0u8 = vdup_n_u8(0);
- uint32x2_t d1u32 = vdup_n_u32(0);
+ const uint32x2_t zero = vdup_n_u32(0);
+ const uint8x8_t left_u8 =
+ vreinterpret_u8_u32(vld1_lane_u32((const uint32_t *)left, zero, 0));
+ uint8x8_t d;
(void)above;
- d1u32 = vld1_lane_u32((const uint32_t *)left, d1u32, 0);
-
- d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 0);
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
+ d = vdup_lane_u8(left_u8, 0);
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0);
dst += stride;
- d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 1);
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
+ d = vdup_lane_u8(left_u8, 1);
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0);
dst += stride;
- d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 2);
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
+ d = vdup_lane_u8(left_u8, 2);
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0);
dst += stride;
- d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 3);
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
+ d = vdup_lane_u8(left_u8, 3);
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0);
}
void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
- uint8x8_t d0u8 = vdup_n_u8(0);
- uint64x1_t d1u64 = vdup_n_u64(0);
+ const uint8x8_t left_u8 = vld1_u8(left);
+ uint8x8_t d;
(void)above;
- d1u64 = vld1_u64((const uint64_t *)left);
-
- d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 0);
- vst1_u8(dst, d0u8);
+ d = vdup_lane_u8(left_u8, 0);
+ vst1_u8(dst, d);
dst += stride;
- d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 1);
- vst1_u8(dst, d0u8);
+ d = vdup_lane_u8(left_u8, 1);
+ vst1_u8(dst, d);
dst += stride;
- d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 2);
- vst1_u8(dst, d0u8);
+ d = vdup_lane_u8(left_u8, 2);
+ vst1_u8(dst, d);
dst += stride;
- d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 3);
- vst1_u8(dst, d0u8);
+ d = vdup_lane_u8(left_u8, 3);
+ vst1_u8(dst, d);
dst += stride;
- d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 4);
- vst1_u8(dst, d0u8);
+ d = vdup_lane_u8(left_u8, 4);
+ vst1_u8(dst, d);
dst += stride;
- d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 5);
- vst1_u8(dst, d0u8);
+ d = vdup_lane_u8(left_u8, 5);
+ vst1_u8(dst, d);
dst += stride;
- d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 6);
- vst1_u8(dst, d0u8);
+ d = vdup_lane_u8(left_u8, 6);
+ vst1_u8(dst, d);
dst += stride;
- d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 7);
- vst1_u8(dst, d0u8);
+ d = vdup_lane_u8(left_u8, 7);
+ vst1_u8(dst, d);
}
void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
- int j;
- uint8x8_t d2u8 = vdup_n_u8(0);
- uint8x16_t q0u8 = vdupq_n_u8(0);
- uint8x16_t q1u8 = vdupq_n_u8(0);
+ const uint8x16_t left_u8q = vld1q_u8(left);
+ uint8x8_t left_u8d = vget_low_u8(left_u8q);
+ uint8x16_t d;
+ int i;
(void)above;
- q1u8 = vld1q_u8(left);
- d2u8 = vget_low_u8(q1u8);
- for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) {
- q0u8 = vdupq_lane_u8(d2u8, 0);
- vst1q_u8(dst, q0u8);
+ for (i = 0; i < 2; i++, left_u8d = vget_high_u8(left_u8q)) {
+ d = vdupq_lane_u8(left_u8d, 0);
+ vst1q_u8(dst, d);
dst += stride;
- q0u8 = vdupq_lane_u8(d2u8, 1);
- vst1q_u8(dst, q0u8);
+ d = vdupq_lane_u8(left_u8d, 1);
+ vst1q_u8(dst, d);
dst += stride;
- q0u8 = vdupq_lane_u8(d2u8, 2);
- vst1q_u8(dst, q0u8);
+ d = vdupq_lane_u8(left_u8d, 2);
+ vst1q_u8(dst, d);
dst += stride;
- q0u8 = vdupq_lane_u8(d2u8, 3);
- vst1q_u8(dst, q0u8);
+ d = vdupq_lane_u8(left_u8d, 3);
+ vst1q_u8(dst, d);
dst += stride;
- q0u8 = vdupq_lane_u8(d2u8, 4);
- vst1q_u8(dst, q0u8);
+ d = vdupq_lane_u8(left_u8d, 4);
+ vst1q_u8(dst, d);
dst += stride;
- q0u8 = vdupq_lane_u8(d2u8, 5);
- vst1q_u8(dst, q0u8);
+ d = vdupq_lane_u8(left_u8d, 5);
+ vst1q_u8(dst, d);
dst += stride;
- q0u8 = vdupq_lane_u8(d2u8, 6);
- vst1q_u8(dst, q0u8);
+ d = vdupq_lane_u8(left_u8d, 6);
+ vst1q_u8(dst, d);
dst += stride;
- q0u8 = vdupq_lane_u8(d2u8, 7);
- vst1q_u8(dst, q0u8);
+ d = vdupq_lane_u8(left_u8d, 7);
+ vst1q_u8(dst, d);
dst += stride;
}
}
void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
- int j, k;
- uint8x8_t d2u8 = vdup_n_u8(0);
- uint8x16_t q0u8 = vdupq_n_u8(0);
- uint8x16_t q1u8 = vdupq_n_u8(0);
+ uint8x16_t d;
+ int i;
(void)above;
- for (k = 0; k < 2; k++, left += 16) {
- q1u8 = vld1q_u8(left);
- d2u8 = vget_low_u8(q1u8);
- for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) {
- q0u8 = vdupq_lane_u8(d2u8, 0);
- vst1q_u8(dst, q0u8);
- vst1q_u8(dst + 16, q0u8);
- dst += stride;
- q0u8 = vdupq_lane_u8(d2u8, 1);
- vst1q_u8(dst, q0u8);
- vst1q_u8(dst + 16, q0u8);
- dst += stride;
- q0u8 = vdupq_lane_u8(d2u8, 2);
- vst1q_u8(dst, q0u8);
- vst1q_u8(dst + 16, q0u8);
- dst += stride;
- q0u8 = vdupq_lane_u8(d2u8, 3);
- vst1q_u8(dst, q0u8);
- vst1q_u8(dst + 16, q0u8);
- dst += stride;
- q0u8 = vdupq_lane_u8(d2u8, 4);
- vst1q_u8(dst, q0u8);
- vst1q_u8(dst + 16, q0u8);
- dst += stride;
- q0u8 = vdupq_lane_u8(d2u8, 5);
- vst1q_u8(dst, q0u8);
- vst1q_u8(dst + 16, q0u8);
- dst += stride;
- q0u8 = vdupq_lane_u8(d2u8, 6);
- vst1q_u8(dst, q0u8);
- vst1q_u8(dst + 16, q0u8);
- dst += stride;
- q0u8 = vdupq_lane_u8(d2u8, 7);
- vst1q_u8(dst, q0u8);
- vst1q_u8(dst + 16, q0u8);
- dst += stride;
- }
+ for (i = 0; i < 2; i++, left += 16) {
+ const uint8x16_t left_u8 = vld1q_u8(left);
+ const uint8x8_t left_low = vget_low_u8(left_u8);
+ const uint8x8_t left_high = vget_high_u8(left_u8);
+ d = vdupq_lane_u8(left_low, 0);
+ vst1q_u8(dst, d); // Note clang-3.8 produced poor code w/vst2q_u8
+ dst += 16;
+ vst1q_u8(dst, d);
+ dst += stride - 16;
+ d = vdupq_lane_u8(left_low, 1);
+ vst1q_u8(dst, d);
+ dst += 16;
+ vst1q_u8(dst, d);
+ dst += stride - 16;
+ d = vdupq_lane_u8(left_low, 2);
+ vst1q_u8(dst, d);
+ dst += 16;
+ vst1q_u8(dst, d);
+ dst += stride - 16;
+ d = vdupq_lane_u8(left_low, 3);
+ vst1q_u8(dst, d);
+ dst += 16;
+ vst1q_u8(dst, d);
+ dst += stride - 16;
+ d = vdupq_lane_u8(left_low, 4);
+ vst1q_u8(dst, d);
+ dst += 16;
+ vst1q_u8(dst, d);
+ dst += stride - 16;
+ d = vdupq_lane_u8(left_low, 5);
+ vst1q_u8(dst, d);
+ dst += 16;
+ vst1q_u8(dst, d);
+ dst += stride - 16;
+ d = vdupq_lane_u8(left_low, 6);
+ vst1q_u8(dst, d);
+ dst += 16;
+ vst1q_u8(dst, d);
+ dst += stride - 16;
+ d = vdupq_lane_u8(left_low, 7);
+ vst1q_u8(dst, d);
+ dst += 16;
+ vst1q_u8(dst, d);
+ dst += stride - 16;
+
+ d = vdupq_lane_u8(left_high, 0);
+ vst1q_u8(dst, d);
+ dst += 16;
+ vst1q_u8(dst, d);
+ dst += stride - 16;
+ d = vdupq_lane_u8(left_high, 1);
+ vst1q_u8(dst, d);
+ dst += 16;
+ vst1q_u8(dst, d);
+ dst += stride - 16;
+ d = vdupq_lane_u8(left_high, 2);
+ vst1q_u8(dst, d);
+ dst += 16;
+ vst1q_u8(dst, d);
+ dst += stride - 16;
+ d = vdupq_lane_u8(left_high, 3);
+ vst1q_u8(dst, d);
+ dst += 16;
+ vst1q_u8(dst, d);
+ dst += stride - 16;
+ d = vdupq_lane_u8(left_high, 4);
+ vst1q_u8(dst, d);
+ dst += 16;
+ vst1q_u8(dst, d);
+ dst += stride - 16;
+ d = vdupq_lane_u8(left_high, 5);
+ vst1q_u8(dst, d);
+ dst += 16;
+ vst1q_u8(dst, d);
+ dst += stride - 16;
+ d = vdupq_lane_u8(left_high, 6);
+ vst1q_u8(dst, d);
+ dst += 16;
+ vst1q_u8(dst, d);
+ dst += stride - 16;
+ d = vdupq_lane_u8(left_high, 7);
+ vst1q_u8(dst, d);
+ dst += 16;
+ vst1q_u8(dst, d);
+ dst += stride - 16;
}
}
+// -----------------------------------------------------------------------------
+
+static INLINE int16x8_t convert_u8_to_s16(uint8x8_t v) {
+ return vreinterpretq_s16_u16(vmovl_u8(v));
+}
+
void vpx_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
- int i;
- uint16x8_t q1u16, q3u16;
- int16x8_t q1s16;
- uint8x8_t d0u8 = vdup_n_u8(0);
- uint32x2_t d2u32 = vdup_n_u32(0);
-
- d0u8 = vld1_dup_u8(above - 1);
- d2u32 = vld1_lane_u32((const uint32_t *)above, d2u32, 0);
- q3u16 = vsubl_u8(vreinterpret_u8_u32(d2u32), d0u8);
- for (i = 0; i < 4; i++, dst += stride) {
- q1u16 = vdupq_n_u16((uint16_t)left[i]);
- q1s16 =
- vaddq_s16(vreinterpretq_s16_u16(q1u16), vreinterpretq_s16_u16(q3u16));
- d0u8 = vqmovun_s16(q1s16);
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
- }
+ const uint8x8_t top_left = vld1_dup_u8(above - 1);
+ const uint8x8_t left_u8 = vld1_u8(left);
+ const uint8x8_t above_u8 = vld1_u8(above);
+ const int16x4_t left_s16 = vget_low_s16(convert_u8_to_s16(left_u8));
+ int16x8_t sub, sum;
+ uint32x2_t d;
+
+ sub = vreinterpretq_s16_u16(vsubl_u8(above_u8, top_left));
+ // Avoid vcombine_s16() which generates lots of redundant code with clang-3.8.
+ sub = vreinterpretq_s16_s64(
+ vdupq_lane_s64(vreinterpret_s64_s16(vget_low_s16(sub)), 0));
+
+ sum = vcombine_s16(vdup_lane_s16(left_s16, 0), vdup_lane_s16(left_s16, 1));
+ sum = vaddq_s16(sum, sub);
+ d = vreinterpret_u32_u8(vqmovun_s16(sum));
+ vst1_lane_u32((uint32_t *)dst, d, 0);
+ dst += stride;
+ vst1_lane_u32((uint32_t *)dst, d, 1);
+ dst += stride;
+
+ sum = vcombine_s16(vdup_lane_s16(left_s16, 2), vdup_lane_s16(left_s16, 3));
+ sum = vaddq_s16(sum, sub);
+ d = vreinterpret_u32_u8(vqmovun_s16(sum));
+ vst1_lane_u32((uint32_t *)dst, d, 0);
+ dst += stride;
+ vst1_lane_u32((uint32_t *)dst, d, 1);
+}
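The rewritten TM predictors all vectorize the same per-pixel rule: add the row's left pixel to the column's above pixel, subtract the shared top-left pixel, and saturate to [0, 255]; vqmovun_s16 performs that clamp in the NEON kernels. A minimal scalar sketch (tm_pixel is a hypothetical name, not part of the patch):

static unsigned char tm_pixel(unsigned char above, unsigned char left,
                              unsigned char top_left) {
  const int v = left + above - top_left;                  /* may leave [0, 255] */
  return (unsigned char)(v < 0 ? 0 : v > 255 ? 255 : v);  /* saturate */
}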
+
+static INLINE void tm_8_kernel(uint8_t **dst, const ptrdiff_t stride,
+ const int16x8_t left_dup, const int16x8_t sub) {
+ const int16x8_t sum = vaddq_s16(left_dup, sub);
+ const uint8x8_t d = vqmovun_s16(sum);
+ vst1_u8(*dst, d);
+ *dst += stride;
}
void vpx_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
- int j;
- uint16x8_t q0u16, q3u16, q10u16;
- int16x8_t q0s16;
- uint16x4_t d20u16;
- uint8x8_t d0u8, d2u8, d30u8;
-
- d0u8 = vld1_dup_u8(above - 1);
- d30u8 = vld1_u8(left);
- d2u8 = vld1_u8(above);
- q10u16 = vmovl_u8(d30u8);
- q3u16 = vsubl_u8(d2u8, d0u8);
- d20u16 = vget_low_u16(q10u16);
- for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) {
- q0u16 = vdupq_lane_u16(d20u16, 0);
- q0s16 =
- vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16));
- d0u8 = vqmovun_s16(q0s16);
- vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
- dst += stride;
- q0u16 = vdupq_lane_u16(d20u16, 1);
- q0s16 =
- vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16));
- d0u8 = vqmovun_s16(q0s16);
- vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
- dst += stride;
- q0u16 = vdupq_lane_u16(d20u16, 2);
- q0s16 =
- vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16));
- d0u8 = vqmovun_s16(q0s16);
- vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
- dst += stride;
- q0u16 = vdupq_lane_u16(d20u16, 3);
- q0s16 =
- vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16));
- d0u8 = vqmovun_s16(q0s16);
- vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
- dst += stride;
+ const uint8x8_t top_left = vld1_dup_u8(above - 1);
+ const uint8x8_t above_u8 = vld1_u8(above);
+ const uint8x8_t left_u8 = vld1_u8(left);
+ const int16x8_t left_s16q = convert_u8_to_s16(left_u8);
+ const int16x8_t sub = vreinterpretq_s16_u16(vsubl_u8(above_u8, top_left));
+ int16x4_t left_s16d = vget_low_s16(left_s16q);
+ int i;
+
+ for (i = 0; i < 2; i++, left_s16d = vget_high_s16(left_s16q)) {
+ int16x8_t left_dup;
+
+ left_dup = vdupq_lane_s16(left_s16d, 0);
+ tm_8_kernel(&dst, stride, left_dup, sub);
+ left_dup = vdupq_lane_s16(left_s16d, 1);
+ tm_8_kernel(&dst, stride, left_dup, sub);
+ left_dup = vdupq_lane_s16(left_s16d, 2);
+ tm_8_kernel(&dst, stride, left_dup, sub);
+ left_dup = vdupq_lane_s16(left_s16d, 3);
+ tm_8_kernel(&dst, stride, left_dup, sub);
}
}
+static INLINE void tm_16_kernel(uint8_t **dst, const ptrdiff_t stride,
+ const int16x8_t left_dup, const int16x8_t sub0,
+ const int16x8_t sub1) {
+ const int16x8_t sum0 = vaddq_s16(left_dup, sub0);
+ const int16x8_t sum1 = vaddq_s16(left_dup, sub1);
+ const uint8x8_t d0 = vqmovun_s16(sum0);
+ const uint8x8_t d1 = vqmovun_s16(sum1);
+ vst1_u8(*dst, d0);
+ *dst += 8;
+ vst1_u8(*dst, d1);
+ *dst += stride - 8;
+}
+
void vpx_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
- int j, k;
- uint16x8_t q0u16, q2u16, q3u16, q8u16, q10u16;
- uint8x16_t q0u8, q1u8;
- int16x8_t q0s16, q1s16, q8s16, q11s16;
- uint16x4_t d20u16;
- uint8x8_t d2u8, d3u8, d18u8, d22u8, d23u8;
-
- q0u8 = vld1q_dup_u8(above - 1);
- q1u8 = vld1q_u8(above);
- q2u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8));
- q3u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8));
- for (k = 0; k < 2; k++, left += 8) {
- d18u8 = vld1_u8(left);
- q10u16 = vmovl_u8(d18u8);
- d20u16 = vget_low_u16(q10u16);
- for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) {
- q0u16 = vdupq_lane_u16(d20u16, 0);
- q8u16 = vdupq_lane_u16(d20u16, 1);
- q1s16 =
- vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q2u16));
- q0s16 =
- vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q3u16));
- q11s16 =
- vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q2u16));
- q8s16 =
- vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q3u16));
- d2u8 = vqmovun_s16(q1s16);
- d3u8 = vqmovun_s16(q0s16);
- d22u8 = vqmovun_s16(q11s16);
- d23u8 = vqmovun_s16(q8s16);
- vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8));
- vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8));
- dst += stride;
- vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8));
- vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8));
- dst += stride;
-
- q0u16 = vdupq_lane_u16(d20u16, 2);
- q8u16 = vdupq_lane_u16(d20u16, 3);
- q1s16 =
- vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q2u16));
- q0s16 =
- vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q3u16));
- q11s16 =
- vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q2u16));
- q8s16 =
- vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q3u16));
- d2u8 = vqmovun_s16(q1s16);
- d3u8 = vqmovun_s16(q0s16);
- d22u8 = vqmovun_s16(q11s16);
- d23u8 = vqmovun_s16(q8s16);
- vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8));
- vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8));
- dst += stride;
- vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8));
- vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8));
- dst += stride;
- }
+ const uint8x16_t top_left = vld1q_dup_u8(above - 1);
+ const uint8x16_t above_u8 = vld1q_u8(above);
+ const int16x8_t sub0 = vreinterpretq_s16_u16(
+ vsubl_u8(vget_low_u8(above_u8), vget_low_u8(top_left)));
+ const int16x8_t sub1 = vreinterpretq_s16_u16(
+ vsubl_u8(vget_high_u8(above_u8), vget_high_u8(top_left)));
+ int16x8_t left_dup;
+ int i;
+
+ for (i = 0; i < 2; i++, left += 8) {
+ const uint8x8_t left_u8 = vld1_u8(left);
+ const int16x8_t left_s16q = convert_u8_to_s16(left_u8);
+ const int16x4_t left_low = vget_low_s16(left_s16q);
+ const int16x4_t left_high = vget_high_s16(left_s16q);
+
+ left_dup = vdupq_lane_s16(left_low, 0);
+ tm_16_kernel(&dst, stride, left_dup, sub0, sub1);
+ left_dup = vdupq_lane_s16(left_low, 1);
+ tm_16_kernel(&dst, stride, left_dup, sub0, sub1);
+ left_dup = vdupq_lane_s16(left_low, 2);
+ tm_16_kernel(&dst, stride, left_dup, sub0, sub1);
+ left_dup = vdupq_lane_s16(left_low, 3);
+ tm_16_kernel(&dst, stride, left_dup, sub0, sub1);
+
+ left_dup = vdupq_lane_s16(left_high, 0);
+ tm_16_kernel(&dst, stride, left_dup, sub0, sub1);
+ left_dup = vdupq_lane_s16(left_high, 1);
+ tm_16_kernel(&dst, stride, left_dup, sub0, sub1);
+ left_dup = vdupq_lane_s16(left_high, 2);
+ tm_16_kernel(&dst, stride, left_dup, sub0, sub1);
+ left_dup = vdupq_lane_s16(left_high, 3);
+ tm_16_kernel(&dst, stride, left_dup, sub0, sub1);
}
}
+static INLINE void tm_32_kernel(uint8_t **dst, const ptrdiff_t stride,
+ const int16x8_t left_dup, const int16x8_t sub0,
+ const int16x8_t sub1, const int16x8_t sub2,
+ const int16x8_t sub3) {
+ const int16x8_t sum0 = vaddq_s16(left_dup, sub0);
+ const int16x8_t sum1 = vaddq_s16(left_dup, sub1);
+ const int16x8_t sum2 = vaddq_s16(left_dup, sub2);
+ const int16x8_t sum3 = vaddq_s16(left_dup, sub3);
+ const uint8x8_t d0 = vqmovun_s16(sum0);
+ const uint8x8_t d1 = vqmovun_s16(sum1);
+ const uint8x8_t d2 = vqmovun_s16(sum2);
+ const uint8x8_t d3 = vqmovun_s16(sum3);
+
+ vst1q_u8(*dst, vcombine_u8(d0, d1));
+ *dst += 16;
+ vst1q_u8(*dst, vcombine_u8(d2, d3));
+ *dst += stride - 16;
+}
+
void vpx_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
- int j, k;
- uint16x8_t q0u16, q3u16, q8u16, q9u16, q10u16, q11u16;
- uint8x16_t q0u8, q1u8, q2u8;
- int16x8_t q12s16, q13s16, q14s16, q15s16;
- uint16x4_t d6u16;
- uint8x8_t d0u8, d1u8, d2u8, d3u8, d26u8;
-
- q0u8 = vld1q_dup_u8(above - 1);
- q1u8 = vld1q_u8(above);
- q2u8 = vld1q_u8(above + 16);
- q8u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8));
- q9u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8));
- q10u16 = vsubl_u8(vget_low_u8(q2u8), vget_low_u8(q0u8));
- q11u16 = vsubl_u8(vget_high_u8(q2u8), vget_high_u8(q0u8));
- for (k = 0; k < 4; k++, left += 8) {
- d26u8 = vld1_u8(left);
- q3u16 = vmovl_u8(d26u8);
- d6u16 = vget_low_u16(q3u16);
- for (j = 0; j < 2; j++, d6u16 = vget_high_u16(q3u16)) {
- q0u16 = vdupq_lane_u16(d6u16, 0);
- q12s16 =
- vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16));
- q13s16 =
- vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16));
- q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q10u16));
- q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q11u16));
- d0u8 = vqmovun_s16(q12s16);
- d1u8 = vqmovun_s16(q13s16);
- d2u8 = vqmovun_s16(q14s16);
- d3u8 = vqmovun_s16(q15s16);
- q0u8 = vcombine_u8(d0u8, d1u8);
- q1u8 = vcombine_u8(d2u8, d3u8);
- vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
- vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
- dst += stride;
-
- q0u16 = vdupq_lane_u16(d6u16, 1);
- q12s16 =
- vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16));
- q13s16 =
- vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16));
- q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q10u16));
- q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q11u16));
- d0u8 = vqmovun_s16(q12s16);
- d1u8 = vqmovun_s16(q13s16);
- d2u8 = vqmovun_s16(q14s16);
- d3u8 = vqmovun_s16(q15s16);
- q0u8 = vcombine_u8(d0u8, d1u8);
- q1u8 = vcombine_u8(d2u8, d3u8);
- vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
- vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
- dst += stride;
-
- q0u16 = vdupq_lane_u16(d6u16, 2);
- q12s16 =
- vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16));
- q13s16 =
- vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16));
- q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q10u16));
- q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q11u16));
- d0u8 = vqmovun_s16(q12s16);
- d1u8 = vqmovun_s16(q13s16);
- d2u8 = vqmovun_s16(q14s16);
- d3u8 = vqmovun_s16(q15s16);
- q0u8 = vcombine_u8(d0u8, d1u8);
- q1u8 = vcombine_u8(d2u8, d3u8);
- vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
- vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
- dst += stride;
-
- q0u16 = vdupq_lane_u16(d6u16, 3);
- q12s16 =
- vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16));
- q13s16 =
- vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16));
- q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q10u16));
- q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q11u16));
- d0u8 = vqmovun_s16(q12s16);
- d1u8 = vqmovun_s16(q13s16);
- d2u8 = vqmovun_s16(q14s16);
- d3u8 = vqmovun_s16(q15s16);
- q0u8 = vcombine_u8(d0u8, d1u8);
- q1u8 = vcombine_u8(d2u8, d3u8);
- vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
- vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
- dst += stride;
+ const uint8x16_t top_left = vld1q_dup_u8(above - 1);
+ const uint8x16_t above_low = vld1q_u8(above);
+ const uint8x16_t above_high = vld1q_u8(above + 16);
+ const int16x8_t sub0 = vreinterpretq_s16_u16(
+ vsubl_u8(vget_low_u8(above_low), vget_low_u8(top_left)));
+ const int16x8_t sub1 = vreinterpretq_s16_u16(
+ vsubl_u8(vget_high_u8(above_low), vget_high_u8(top_left)));
+ const int16x8_t sub2 = vreinterpretq_s16_u16(
+ vsubl_u8(vget_low_u8(above_high), vget_low_u8(top_left)));
+ const int16x8_t sub3 = vreinterpretq_s16_u16(
+ vsubl_u8(vget_high_u8(above_high), vget_high_u8(top_left)));
+ int16x8_t left_dup;
+ int i, j;
+
+ for (j = 0; j < 4; j++, left += 8) {
+ const uint8x8_t left_u8 = vld1_u8(left);
+ const int16x8_t left_s16q = convert_u8_to_s16(left_u8);
+ int16x4_t left_s16d = vget_low_s16(left_s16q);
+ for (i = 0; i < 2; i++, left_s16d = vget_high_s16(left_s16q)) {
+ left_dup = vdupq_lane_s16(left_s16d, 0);
+ tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3);
+ left_dup = vdupq_lane_s16(left_s16d, 1);
+ tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3);
+ left_dup = vdupq_lane_s16(left_s16d, 2);
+ tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3);
+ left_dup = vdupq_lane_s16(left_s16d, 3);
+ tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3);
}
}
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/loopfilter_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/loopfilter_neon.c
index fc080163bb4..7419cea022d 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/loopfilter_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/loopfilter_neon.c
@@ -423,8 +423,8 @@ static INLINE void apply_15_tap_filter_16(
filter = vand##r##s8(filter, vreinterpret##r##s8_u8(mask)); \
\
/* save bottom 3 bits so that we round one side +4 and the other +3 */ \
- /* if it equals 4 we'll set to adjust by -1 to account for the fact */ \
- /* we'd round 3 the other way */ \
+ /* if it equals 4 we'll set it to adjust by -1 to account for the fact */ \
+ /* we'd round it by 3 the other way */ \
filter1 = vshr##r##n_s8(vqadd##r##s8(filter, vdup##r##n_s8(4)), 3); \
filter2 = vshr##r##n_s8(vqadd##r##s8(filter, vdup##r##n_s8(3)), 3); \
\
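The reworded comment describes the asymmetric rounding in this macro: the same saturated filter value is rounded with +4 before the arithmetic shift on one side of the edge and with +3 on the other, so the two per-pixel adjustments differ by at most one. A hedged per-lane sketch of just the two lines shown (clamp8 and split_rounding are hypothetical names; how filter1 and filter2 are applied to q0 and p0 lives outside this hunk):

static signed char clamp8(int v) {  /* saturate to int8, as vqadd_s8 does per lane */
  return (signed char)(v < -128 ? -128 : v > 127 ? 127 : v);
}

static void split_rounding(int filter, int *filter1, int *filter2) {
  *filter1 = clamp8(filter + 4) >> 3;  /* rounds one side up a little more */
  *filter2 = clamp8(filter + 3) >> 3;  /* ...and the other a little less */
}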
@@ -909,7 +909,7 @@ void vpx_lpf_vertical_8_neon(uint8_t *s, int p, const uint8_t *blimit,
p0, q0, q1, q2, q3, &flat, &flat_status, &hev);
filter8_8(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2,
&op1, &op0, &oq0, &oq1, &oq2);
- // Note: tranpose + store_8x8() is faster than store_6x8().
+ // Note: transpose + store_8x8() is faster than store_6x8().
transpose_u8_8x8(&p3, &op2, &op1, &op0, &oq0, &oq1, &oq2, &q3);
store_8x8(s - 4, p, p3, op2, op1, op0, oq0, oq1, oq2, q3);
}
@@ -934,7 +934,7 @@ void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
p0, q0, q1, q2, q3, &flat, &flat_status, &hev);
filter8_16(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2,
&op1, &op0, &oq0, &oq1, &oq2);
- // Note: store_6x8() twice is faster than tranpose + store_8x16().
+ // Note: store_6x8() twice is faster than transpose + store_8x16().
store_6x8(s, p, vget_low_u8(op2), vget_low_u8(op1), vget_low_u8(op0),
vget_low_u8(oq0), vget_low_u8(oq1), vget_low_u8(oq2));
store_6x8(s + 8 * p, p, vget_high_u8(op2), vget_high_u8(op1),
@@ -1037,7 +1037,7 @@ void vpx_lpf_vertical_16_neon(uint8_t *s, int p, const uint8_t *blimit,
&s6, &s7);
store_16x8(s, p, s0, s1, s2, s3, s4, s5, s6, s7);
} else {
- // Note: tranpose + store_8x8() is faster than store_6x8().
+ // Note: transpose + store_8x8() is faster than store_6x8().
transpose_u8_8x8(&p3, &op2, &op1, &op0, &oq0, &oq1, &oq2, &q3);
store_8x8(s + 4, p, p3, op2, op1, op0, oq0, oq1, oq2, q3);
}
@@ -1074,7 +1074,7 @@ void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit,
store_16x16(s, p, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
s13, s14, s15);
} else {
- // Note: store_6x8() twice is faster than tranpose + store_8x16().
+ // Note: store_6x8() twice is faster than transpose + store_8x16().
s += 8;
store_6x8(s, p, vget_low_u8(op2), vget_low_u8(op1), vget_low_u8(op0),
vget_low_u8(oq0), vget_low_u8(oq1), vget_low_u8(oq2));
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/transpose_neon.h b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/transpose_neon.h
index 55188c5bc21..445add29689 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/transpose_neon.h
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/transpose_neon.h
@@ -39,6 +39,15 @@ static INLINE uint8x16x2_t vpx_vtrnq_u64(uint32x4_t a0, uint32x4_t a1) {
return b0;
}
+static INLINE uint16x8x2_t vpx_vtrnq_u64_to_u16(uint32x4_t a0, uint32x4_t a1) {
+ uint16x8x2_t b0;
+ b0.val[0] = vcombine_u16(vreinterpret_u16_u32(vget_low_u32(a0)),
+ vreinterpret_u16_u32(vget_low_u32(a1)));
+ b0.val[1] = vcombine_u16(vreinterpret_u16_u32(vget_high_u32(a0)),
+ vreinterpret_u16_u32(vget_high_u32(a1)));
+ return b0;
+}
+
static INLINE void transpose_u8_4x4(uint8x8_t *a0, uint8x8_t *a1) {
// Swap 16 bit elements. Goes from:
// a0: 00 01 02 03 10 11 12 13
@@ -68,6 +77,70 @@ static INLINE void transpose_u8_4x4(uint8x8_t *a0, uint8x8_t *a1) {
*a1 = d0.val[1];
}
+static INLINE void transpose_s16_4x4d(int16x4_t *a0, int16x4_t *a1,
+ int16x4_t *a2, int16x4_t *a3) {
+ // Swap 16 bit elements. Goes from:
+ // a0: 00 01 02 03
+ // a1: 10 11 12 13
+ // a2: 20 21 22 23
+ // a3: 30 31 32 33
+ // to:
+ // b0.val[0]: 00 10 02 12
+ // b0.val[1]: 01 11 03 13
+ // b1.val[0]: 20 30 22 32
+ // b1.val[1]: 21 31 23 33
+
+ const int16x4x2_t b0 = vtrn_s16(*a0, *a1);
+ const int16x4x2_t b1 = vtrn_s16(*a2, *a3);
+
+ // Swap 32 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30
+ // c0.val[1]: 02 12 22 32
+ // c1.val[0]: 01 11 21 31
+ // c1.val[1]: 03 13 23 33
+
+ const int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]),
+ vreinterpret_s32_s16(b1.val[0]));
+ const int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]),
+ vreinterpret_s32_s16(b1.val[1]));
+
+ *a0 = vreinterpret_s16_s32(c0.val[0]);
+ *a1 = vreinterpret_s16_s32(c1.val[0]);
+ *a2 = vreinterpret_s16_s32(c0.val[1]);
+ *a3 = vreinterpret_s16_s32(c1.val[1]);
+}
+
+static INLINE void transpose_u16_4x4q(uint16x8_t *a0, uint16x8_t *a1) {
+ // Swap 32 bit elements. Goes from:
+ // a0: 00 01 02 03 10 11 12 13
+ // a1: 20 21 22 23 30 31 32 33
+ // to:
+ // b0.val[0]: 00 01 20 21 10 11 30 31
+ // b0.val[1]: 02 03 22 23 12 13 32 33
+
+ const uint32x4x2_t b0 =
+ vtrnq_u32(vreinterpretq_u32_u16(*a0), vreinterpretq_u32_u16(*a1));
+
+ // Swap 64 bit elements resulting in:
+ // c0.val[0]: 00 01 20 21 02 03 22 23
+ // c0.val[1]: 10 11 30 31 12 13 32 33
+
+ const uint32x4_t c0 =
+ vcombine_u32(vget_low_u32(b0.val[0]), vget_low_u32(b0.val[1]));
+ const uint32x4_t c1 =
+ vcombine_u32(vget_high_u32(b0.val[0]), vget_high_u32(b0.val[1]));
+
+ // Swap 16 bit elements resulting in:
+ // d0.val[0]: 00 10 20 30 02 12 22 32
+ // d0.val[1]: 01 11 21 31 03 13 23 33
+
+ const uint16x8x2_t d0 =
+ vtrnq_u16(vreinterpretq_u16_u32(c0), vreinterpretq_u16_u32(c1));
+
+ *a0 = d0.val[0];
+ *a1 = d0.val[1];
+}
+
static INLINE void transpose_u8_8x4(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
uint8x8_t *a3) {
// Swap 8 bit elements. Goes from:
@@ -101,6 +174,39 @@ static INLINE void transpose_u8_8x4(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
*a3 = vreinterpret_u8_u16(c1.val[1]);
}
+static INLINE void transpose_u16_8x4(uint16x8_t *a0, uint16x8_t *a1,
+ uint16x8_t *a2, uint16x8_t *a3) {
+ // Swap 16 bit elements. Goes from:
+ // a0: 00 01 02 03 04 05 06 07
+ // a1: 10 11 12 13 14 15 16 17
+ // a2: 20 21 22 23 24 25 26 27
+ // a3: 30 31 32 33 34 35 36 37
+ // to:
+ // b0.val[0]: 00 10 02 12 04 14 06 16
+ // b0.val[1]: 01 11 03 13 05 15 07 17
+ // b1.val[0]: 20 30 22 32 24 34 26 36
+ // b1.val[1]: 21 31 23 33 25 35 27 37
+
+ const uint16x8x2_t b0 = vtrnq_u16(*a0, *a1);
+ const uint16x8x2_t b1 = vtrnq_u16(*a2, *a3);
+
+ // Swap 32 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30 04 14 24 34
+ // c0.val[1]: 02 12 22 32 06 16 26 36
+ // c1.val[0]: 01 11 21 31 05 15 25 35
+ // c1.val[1]: 03 13 23 33 07 17 27 37
+
+ const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]),
+ vreinterpretq_u32_u16(b1.val[0]));
+ const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]),
+ vreinterpretq_u32_u16(b1.val[1]));
+
+ *a0 = vreinterpretq_u16_u32(c0.val[0]);
+ *a1 = vreinterpretq_u16_u32(c1.val[0]);
+ *a2 = vreinterpretq_u16_u32(c0.val[1]);
+ *a3 = vreinterpretq_u16_u32(c1.val[1]);
+}
+
// Note: Using 'd' registers or 'q' registers has almost identical speed. We use
// 'q' registers here to save some instructions.
static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
@@ -228,6 +334,77 @@ static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1,
*a7 = d3.val[1];
}
+static INLINE void transpose_u16_8x8(uint16x8_t *a0, uint16x8_t *a1,
+ uint16x8_t *a2, uint16x8_t *a3,
+ uint16x8_t *a4, uint16x8_t *a5,
+ uint16x8_t *a6, uint16x8_t *a7) {
+ // Swap 16 bit elements. Goes from:
+ // a0: 00 01 02 03 04 05 06 07
+ // a1: 10 11 12 13 14 15 16 17
+ // a2: 20 21 22 23 24 25 26 27
+ // a3: 30 31 32 33 34 35 36 37
+ // a4: 40 41 42 43 44 45 46 47
+ // a5: 50 51 52 53 54 55 56 57
+ // a6: 60 61 62 63 64 65 66 67
+ // a7: 70 71 72 73 74 75 76 77
+ // to:
+ // b0.val[0]: 00 10 02 12 04 14 06 16
+ // b0.val[1]: 01 11 03 13 05 15 07 17
+ // b1.val[0]: 20 30 22 32 24 34 26 36
+ // b1.val[1]: 21 31 23 33 25 35 27 37
+ // b2.val[0]: 40 50 42 52 44 54 46 56
+ // b2.val[1]: 41 51 43 53 45 55 47 57
+ // b3.val[0]: 60 70 62 72 64 74 66 76
+ // b3.val[1]: 61 71 63 73 65 75 67 77
+
+ const uint16x8x2_t b0 = vtrnq_u16(*a0, *a1);
+ const uint16x8x2_t b1 = vtrnq_u16(*a2, *a3);
+ const uint16x8x2_t b2 = vtrnq_u16(*a4, *a5);
+ const uint16x8x2_t b3 = vtrnq_u16(*a6, *a7);
+
+ // Swap 32 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30 04 14 24 34
+ // c0.val[1]: 02 12 22 32 06 16 26 36
+ // c1.val[0]: 01 11 21 31 05 15 25 35
+ // c1.val[1]: 03 13 23 33 07 17 27 37
+ // c2.val[0]: 40 50 60 70 44 54 64 74
+ // c2.val[1]: 42 52 62 72 46 56 66 76
+ // c3.val[0]: 41 51 61 71 45 55 65 75
+ // c3.val[1]: 43 53 63 73 47 57 67 77
+
+ const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]),
+ vreinterpretq_u32_u16(b1.val[0]));
+ const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]),
+ vreinterpretq_u32_u16(b1.val[1]));
+ const uint32x4x2_t c2 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[0]),
+ vreinterpretq_u32_u16(b3.val[0]));
+ const uint32x4x2_t c3 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[1]),
+ vreinterpretq_u32_u16(b3.val[1]));
+
+ // Swap 64 bit elements resulting in:
+ // d0.val[0]: 00 10 20 30 40 50 60 70
+ // d0.val[1]: 04 14 24 34 44 54 64 74
+ // d1.val[0]: 01 11 21 31 41 51 61 71
+ // d1.val[1]: 05 15 25 35 45 55 65 75
+ // d2.val[0]: 02 12 22 32 42 52 62 72
+ // d2.val[1]: 06 16 26 36 46 56 66 76
+ // d3.val[0]: 03 13 23 33 43 53 63 73
+ // d3.val[1]: 07 17 27 37 47 57 67 77
+ const uint16x8x2_t d0 = vpx_vtrnq_u64_to_u16(c0.val[0], c2.val[0]);
+ const uint16x8x2_t d1 = vpx_vtrnq_u64_to_u16(c1.val[0], c3.val[0]);
+ const uint16x8x2_t d2 = vpx_vtrnq_u64_to_u16(c0.val[1], c2.val[1]);
+ const uint16x8x2_t d3 = vpx_vtrnq_u64_to_u16(c1.val[1], c3.val[1]);
+
+ *a0 = d0.val[0];
+ *a1 = d1.val[0];
+ *a2 = d2.val[0];
+ *a3 = d3.val[0];
+ *a4 = d0.val[1];
+ *a5 = d1.val[1];
+ *a6 = d2.val[1];
+ *a7 = d3.val[1];
+}
+
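A short usage sketch for the new transpose_u16_8x8() helper, assuming <arm_neon.h> as already included by this header; the block below, its name, and its src/dst parameters are illustrative only:

static INLINE void transpose_8x8_u16_block(const uint16_t *src, int src_stride,
                                           uint16_t *dst, int dst_stride) {
  // Load eight rows of eight 16-bit values.
  uint16x8_t a0 = vld1q_u16(src + 0 * src_stride);
  uint16x8_t a1 = vld1q_u16(src + 1 * src_stride);
  uint16x8_t a2 = vld1q_u16(src + 2 * src_stride);
  uint16x8_t a3 = vld1q_u16(src + 3 * src_stride);
  uint16x8_t a4 = vld1q_u16(src + 4 * src_stride);
  uint16x8_t a5 = vld1q_u16(src + 5 * src_stride);
  uint16x8_t a6 = vld1q_u16(src + 6 * src_stride);
  uint16x8_t a7 = vld1q_u16(src + 7 * src_stride);
  // Transpose in registers, then write the columns back out as rows.
  transpose_u16_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
  vst1q_u16(dst + 0 * dst_stride, a0);
  vst1q_u16(dst + 1 * dst_stride, a1);
  vst1q_u16(dst + 2 * dst_stride, a2);
  vst1q_u16(dst + 3 * dst_stride, a3);
  vst1q_u16(dst + 4 * dst_stride, a4);
  vst1q_u16(dst + 5 * dst_stride, a5);
  vst1q_u16(dst + 6 * dst_stride, a6);
  vst1q_u16(dst + 7 * dst_stride, a7);
}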
static INLINE void transpose_u8_16x8(
const uint8x16_t i0, const uint8x16_t i1, const uint8x16_t i2,
const uint8x16_t i3, const uint8x16_t i4, const uint8x16_t i5,
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c
index e16d33718aa..1386838eea6 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c
@@ -820,14 +820,14 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
s10 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
s += src_stride;
- __builtin_prefetch(dst + 0 * dst_stride);
- __builtin_prefetch(dst + 1 * dst_stride);
- __builtin_prefetch(dst + 2 * dst_stride);
- __builtin_prefetch(dst + 3 * dst_stride);
- __builtin_prefetch(src + 0 * src_stride);
- __builtin_prefetch(src + 1 * src_stride);
- __builtin_prefetch(src + 2 * src_stride);
- __builtin_prefetch(src + 3 * src_stride);
+ __builtin_prefetch(d + 0 * dst_stride);
+ __builtin_prefetch(d + 1 * dst_stride);
+ __builtin_prefetch(d + 2 * dst_stride);
+ __builtin_prefetch(d + 3 * dst_stride);
+ __builtin_prefetch(s + 0 * src_stride);
+ __builtin_prefetch(s + 1 * src_stride);
+ __builtin_prefetch(s + 2 * src_stride);
+ __builtin_prefetch(s + 3 * src_stride);
d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
filter4);
d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
@@ -1002,14 +1002,14 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
s10 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
s += src_stride;
- __builtin_prefetch(dst + 0 * dst_stride);
- __builtin_prefetch(dst + 1 * dst_stride);
- __builtin_prefetch(dst + 2 * dst_stride);
- __builtin_prefetch(dst + 3 * dst_stride);
- __builtin_prefetch(src + 0 * src_stride);
- __builtin_prefetch(src + 1 * src_stride);
- __builtin_prefetch(src + 2 * src_stride);
- __builtin_prefetch(src + 3 * src_stride);
+ __builtin_prefetch(d + 0 * dst_stride);
+ __builtin_prefetch(d + 1 * dst_stride);
+ __builtin_prefetch(d + 2 * dst_stride);
+ __builtin_prefetch(d + 3 * dst_stride);
+ __builtin_prefetch(s + 0 * src_stride);
+ __builtin_prefetch(s + 1 * src_stride);
+ __builtin_prefetch(s + 2 * src_stride);
+ __builtin_prefetch(s + 3 * src_stride);
d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
filter4);
d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/vpx_convolve_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/vpx_convolve_neon.c
index 5d7fa54fcd4..6ca0e501b3c 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/vpx_convolve_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/vpx_convolve_neon.c
@@ -24,16 +24,15 @@ void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]);
// Account for the vertical phase needing 3 lines prior and 4 lines post
- int intermediate_height = h + 7;
+ const int intermediate_height = h + 7;
assert(y_step_q4 == 16);
assert(x_step_q4 == 16);
- /* Filter starting 3 lines back. The neon implementation will ignore the
- * given height and filter a multiple of 4 lines. Since this goes in to
- * the temp buffer which has lots of extra room and is subsequently discarded
- * this is safe if somewhat less than ideal.
- */
+ /* Filter starting 3 lines back. The neon implementation will ignore the given
+   * height and filter a multiple of 4 lines. Since this goes into the temp
+ * buffer which has lots of extra room and is subsequently discarded this is
+ * safe if somewhat less than ideal. */
vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w, filter_x,
x_step_q4, filter_y, y_step_q4, w,
intermediate_height);
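The h + 7 intermediate height is the arithmetic spelled out in the comment above: the 8-tap vertical filter reads 3 source rows before and 4 after each output row, so the horizontal pass must produce h + 3 + 4 rows into the temp buffer. A trivial illustrative sketch of that bookkeeping (the function name is hypothetical):

static int convolve_intermediate_rows(int h) {
  const int taps_above = 3;  /* rows the vertical pass needs before row 0 */
  const int taps_below = 4;  /* rows it needs after row h - 1 */
  return h + taps_above + taps_below;  /* == h + 7 */
}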
@@ -49,7 +48,7 @@ void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_y, int y_step_q4, int w,
int h) {
DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]);
- int intermediate_height = h + 7;
+ const int intermediate_height = h + 7;
assert(y_step_q4 == 16);
assert(x_step_q4 == 16);
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/fwd_txfm.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/fwd_txfm.c
index 4e7d4053ea9..aa59601094d 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/fwd_txfm.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/fwd_txfm.c
@@ -8,6 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <assert.h>
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/fwd_txfm.h"
@@ -21,36 +22,37 @@ void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {
int pass;
// We need an intermediate buffer between passes.
tran_low_t intermediate[4 * 4];
- const int16_t *in_pass0 = input;
- const tran_low_t *in = NULL;
+ const tran_low_t *in_low = NULL;
tran_low_t *out = intermediate;
// Do the two transform/transpose passes
for (pass = 0; pass < 2; ++pass) {
- tran_high_t input[4]; // canbe16
+ tran_high_t in_high[4]; // canbe16
tran_high_t step[4]; // canbe16
tran_high_t temp1, temp2; // needs32
int i;
for (i = 0; i < 4; ++i) {
// Load inputs.
- if (0 == pass) {
- input[0] = in_pass0[0 * stride] * 16;
- input[1] = in_pass0[1 * stride] * 16;
- input[2] = in_pass0[2 * stride] * 16;
- input[3] = in_pass0[3 * stride] * 16;
- if (i == 0 && input[0]) {
- input[0] += 1;
+ if (pass == 0) {
+ in_high[0] = input[0 * stride] * 16;
+ in_high[1] = input[1 * stride] * 16;
+ in_high[2] = input[2 * stride] * 16;
+ in_high[3] = input[3 * stride] * 16;
+ if (i == 0 && in_high[0]) {
+ ++in_high[0];
}
} else {
- input[0] = in[0 * 4];
- input[1] = in[1 * 4];
- input[2] = in[2 * 4];
- input[3] = in[3 * 4];
+ assert(in_low != NULL);
+ in_high[0] = in_low[0 * 4];
+ in_high[1] = in_low[1 * 4];
+ in_high[2] = in_low[2 * 4];
+ in_high[3] = in_low[3 * 4];
+ ++in_low;
}
// Transform.
- step[0] = input[0] + input[3];
- step[1] = input[1] + input[2];
- step[2] = input[1] - input[2];
- step[3] = input[0] - input[3];
+ step[0] = in_high[0] + in_high[3];
+ step[1] = in_high[1] + in_high[2];
+ step[2] = in_high[1] - in_high[2];
+ step[3] = in_high[0] - in_high[3];
temp1 = (step[0] + step[1]) * cospi_16_64;
temp2 = (step[0] - step[1]) * cospi_16_64;
out[0] = (tran_low_t)fdct_round_shift(temp1);
@@ -60,12 +62,11 @@ void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {
out[1] = (tran_low_t)fdct_round_shift(temp1);
out[3] = (tran_low_t)fdct_round_shift(temp2);
// Do next column (which is a transposed row in second/horizontal pass)
- in_pass0++;
- in++;
+ ++input;
out += 4;
}
// Setup in/out for next pass.
- in = intermediate;
+ in_low = intermediate;
out = output;
}
@@ -99,7 +100,6 @@ void vpx_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) {
tran_high_t t0, t1, t2, t3; // needs32
tran_high_t x0, x1, x2, x3; // canbe16
- int i;
for (i = 0; i < 8; i++) {
// stage 1
if (pass == 0) {
@@ -190,56 +190,57 @@ void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) {
int pass;
// We need an intermediate buffer between passes.
tran_low_t intermediate[256];
- const int16_t *in_pass0 = input;
- const tran_low_t *in = NULL;
+ const tran_low_t *in_low = NULL;
tran_low_t *out = intermediate;
// Do the two transform/transpose passes
for (pass = 0; pass < 2; ++pass) {
tran_high_t step1[8]; // canbe16
tran_high_t step2[8]; // canbe16
tran_high_t step3[8]; // canbe16
- tran_high_t input[8]; // canbe16
+ tran_high_t in_high[8]; // canbe16
tran_high_t temp1, temp2; // needs32
int i;
for (i = 0; i < 16; i++) {
if (0 == pass) {
// Calculate input for the first 8 results.
- input[0] = (in_pass0[0 * stride] + in_pass0[15 * stride]) * 4;
- input[1] = (in_pass0[1 * stride] + in_pass0[14 * stride]) * 4;
- input[2] = (in_pass0[2 * stride] + in_pass0[13 * stride]) * 4;
- input[3] = (in_pass0[3 * stride] + in_pass0[12 * stride]) * 4;
- input[4] = (in_pass0[4 * stride] + in_pass0[11 * stride]) * 4;
- input[5] = (in_pass0[5 * stride] + in_pass0[10 * stride]) * 4;
- input[6] = (in_pass0[6 * stride] + in_pass0[9 * stride]) * 4;
- input[7] = (in_pass0[7 * stride] + in_pass0[8 * stride]) * 4;
+ in_high[0] = (input[0 * stride] + input[15 * stride]) * 4;
+ in_high[1] = (input[1 * stride] + input[14 * stride]) * 4;
+ in_high[2] = (input[2 * stride] + input[13 * stride]) * 4;
+ in_high[3] = (input[3 * stride] + input[12 * stride]) * 4;
+ in_high[4] = (input[4 * stride] + input[11 * stride]) * 4;
+ in_high[5] = (input[5 * stride] + input[10 * stride]) * 4;
+ in_high[6] = (input[6 * stride] + input[9 * stride]) * 4;
+ in_high[7] = (input[7 * stride] + input[8 * stride]) * 4;
// Calculate input for the next 8 results.
- step1[0] = (in_pass0[7 * stride] - in_pass0[8 * stride]) * 4;
- step1[1] = (in_pass0[6 * stride] - in_pass0[9 * stride]) * 4;
- step1[2] = (in_pass0[5 * stride] - in_pass0[10 * stride]) * 4;
- step1[3] = (in_pass0[4 * stride] - in_pass0[11 * stride]) * 4;
- step1[4] = (in_pass0[3 * stride] - in_pass0[12 * stride]) * 4;
- step1[5] = (in_pass0[2 * stride] - in_pass0[13 * stride]) * 4;
- step1[6] = (in_pass0[1 * stride] - in_pass0[14 * stride]) * 4;
- step1[7] = (in_pass0[0 * stride] - in_pass0[15 * stride]) * 4;
+ step1[0] = (input[7 * stride] - input[8 * stride]) * 4;
+ step1[1] = (input[6 * stride] - input[9 * stride]) * 4;
+ step1[2] = (input[5 * stride] - input[10 * stride]) * 4;
+ step1[3] = (input[4 * stride] - input[11 * stride]) * 4;
+ step1[4] = (input[3 * stride] - input[12 * stride]) * 4;
+ step1[5] = (input[2 * stride] - input[13 * stride]) * 4;
+ step1[6] = (input[1 * stride] - input[14 * stride]) * 4;
+ step1[7] = (input[0 * stride] - input[15 * stride]) * 4;
} else {
// Calculate input for the first 8 results.
- input[0] = ((in[0 * 16] + 1) >> 2) + ((in[15 * 16] + 1) >> 2);
- input[1] = ((in[1 * 16] + 1) >> 2) + ((in[14 * 16] + 1) >> 2);
- input[2] = ((in[2 * 16] + 1) >> 2) + ((in[13 * 16] + 1) >> 2);
- input[3] = ((in[3 * 16] + 1) >> 2) + ((in[12 * 16] + 1) >> 2);
- input[4] = ((in[4 * 16] + 1) >> 2) + ((in[11 * 16] + 1) >> 2);
- input[5] = ((in[5 * 16] + 1) >> 2) + ((in[10 * 16] + 1) >> 2);
- input[6] = ((in[6 * 16] + 1) >> 2) + ((in[9 * 16] + 1) >> 2);
- input[7] = ((in[7 * 16] + 1) >> 2) + ((in[8 * 16] + 1) >> 2);
+ assert(in_low != NULL);
+ in_high[0] = ((in_low[0 * 16] + 1) >> 2) + ((in_low[15 * 16] + 1) >> 2);
+ in_high[1] = ((in_low[1 * 16] + 1) >> 2) + ((in_low[14 * 16] + 1) >> 2);
+ in_high[2] = ((in_low[2 * 16] + 1) >> 2) + ((in_low[13 * 16] + 1) >> 2);
+ in_high[3] = ((in_low[3 * 16] + 1) >> 2) + ((in_low[12 * 16] + 1) >> 2);
+ in_high[4] = ((in_low[4 * 16] + 1) >> 2) + ((in_low[11 * 16] + 1) >> 2);
+ in_high[5] = ((in_low[5 * 16] + 1) >> 2) + ((in_low[10 * 16] + 1) >> 2);
+ in_high[6] = ((in_low[6 * 16] + 1) >> 2) + ((in_low[9 * 16] + 1) >> 2);
+ in_high[7] = ((in_low[7 * 16] + 1) >> 2) + ((in_low[8 * 16] + 1) >> 2);
// Calculate input for the next 8 results.
- step1[0] = ((in[7 * 16] + 1) >> 2) - ((in[8 * 16] + 1) >> 2);
- step1[1] = ((in[6 * 16] + 1) >> 2) - ((in[9 * 16] + 1) >> 2);
- step1[2] = ((in[5 * 16] + 1) >> 2) - ((in[10 * 16] + 1) >> 2);
- step1[3] = ((in[4 * 16] + 1) >> 2) - ((in[11 * 16] + 1) >> 2);
- step1[4] = ((in[3 * 16] + 1) >> 2) - ((in[12 * 16] + 1) >> 2);
- step1[5] = ((in[2 * 16] + 1) >> 2) - ((in[13 * 16] + 1) >> 2);
- step1[6] = ((in[1 * 16] + 1) >> 2) - ((in[14 * 16] + 1) >> 2);
- step1[7] = ((in[0 * 16] + 1) >> 2) - ((in[15 * 16] + 1) >> 2);
+ step1[0] = ((in_low[7 * 16] + 1) >> 2) - ((in_low[8 * 16] + 1) >> 2);
+ step1[1] = ((in_low[6 * 16] + 1) >> 2) - ((in_low[9 * 16] + 1) >> 2);
+ step1[2] = ((in_low[5 * 16] + 1) >> 2) - ((in_low[10 * 16] + 1) >> 2);
+ step1[3] = ((in_low[4 * 16] + 1) >> 2) - ((in_low[11 * 16] + 1) >> 2);
+ step1[4] = ((in_low[3 * 16] + 1) >> 2) - ((in_low[12 * 16] + 1) >> 2);
+ step1[5] = ((in_low[2 * 16] + 1) >> 2) - ((in_low[13 * 16] + 1) >> 2);
+ step1[6] = ((in_low[1 * 16] + 1) >> 2) - ((in_low[14 * 16] + 1) >> 2);
+ step1[7] = ((in_low[0 * 16] + 1) >> 2) - ((in_low[15 * 16] + 1) >> 2);
+ in_low++;
}
// Work on the first eight values; fdct8(input, even_results);
{
@@ -248,14 +249,14 @@ void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) {
tran_high_t x0, x1, x2, x3; // canbe16
// stage 1
- s0 = input[0] + input[7];
- s1 = input[1] + input[6];
- s2 = input[2] + input[5];
- s3 = input[3] + input[4];
- s4 = input[3] - input[4];
- s5 = input[2] - input[5];
- s6 = input[1] - input[6];
- s7 = input[0] - input[7];
+ s0 = in_high[0] + in_high[7];
+ s1 = in_high[1] + in_high[6];
+ s2 = in_high[2] + in_high[5];
+ s3 = in_high[3] + in_high[4];
+ s4 = in_high[3] - in_high[4];
+ s5 = in_high[2] - in_high[5];
+ s6 = in_high[1] - in_high[6];
+ s7 = in_high[0] - in_high[7];
// fdct4(step, step);
x0 = s0 + s3;
@@ -350,12 +351,11 @@ void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) {
out[15] = (tran_low_t)fdct_round_shift(temp2);
}
// Do next column (which is a transposed row in second/horizontal pass)
- in++;
- in_pass0++;
+ input++;
out += 16;
}
// Setup in/out for next pass.
- in = intermediate;
+ in_low = intermediate;
out = output;
}
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/inv_txfm.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/inv_txfm.c
index 46ddd1da0d0..f3f543ddfe8 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/inv_txfm.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/inv_txfm.c
@@ -96,6 +96,7 @@ void vpx_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) {
void idct4_c(const tran_low_t *input, tran_low_t *output) {
tran_low_t step[4];
tran_high_t temp1, temp2;
+
// stage 1
temp1 = (input[0] + input[2]) * cospi_16_64;
temp2 = (input[0] - input[2]) * cospi_16_64;
@@ -114,9 +115,9 @@ void idct4_c(const tran_low_t *input, tran_low_t *output) {
}
void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+ int i, j;
tran_low_t out[4 * 4];
tran_low_t *outptr = out;
- int i, j;
tran_low_t temp_in[4], temp_out[4];
// Rows
@@ -142,6 +143,7 @@ void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest,
int i;
tran_high_t a1;
tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+
out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
a1 = ROUND_POWER_OF_TWO(out, 4);
@@ -157,6 +159,7 @@ void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest,
void idct8_c(const tran_low_t *input, tran_low_t *output) {
tran_low_t step1[8], step2[8];
tran_high_t temp1, temp2;
+
// stage 1
step1[0] = input[0];
step1[2] = input[4];
@@ -209,9 +212,9 @@ void idct8_c(const tran_low_t *input, tran_low_t *output) {
}
void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+ int i, j;
tran_low_t out[8 * 8];
tran_low_t *outptr = out;
- int i, j;
tran_low_t temp_in[8], temp_out[8];
// First transform rows
@@ -236,6 +239,7 @@ void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
int i, j;
tran_high_t a1;
tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+
out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
a1 = ROUND_POWER_OF_TWO(out, 5);
for (j = 0; j < 8; ++j) {
@@ -246,14 +250,13 @@ void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
void iadst4_c(const tran_low_t *input, tran_low_t *output) {
tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
-
tran_low_t x0 = input[0];
tran_low_t x1 = input[1];
tran_low_t x2 = input[2];
tran_low_t x3 = input[3];
if (!(x0 | x1 | x2 | x3)) {
- output[0] = output[1] = output[2] = output[3] = 0;
+ memset(output, 0, 4 * sizeof(*output));
return;
}
@@ -283,7 +286,6 @@ void iadst4_c(const tran_low_t *input, tran_low_t *output) {
void iadst8_c(const tran_low_t *input, tran_low_t *output) {
int s0, s1, s2, s3, s4, s5, s6, s7;
-
tran_high_t x0 = input[7];
tran_high_t x1 = input[0];
tran_high_t x2 = input[5];
@@ -294,8 +296,7 @@ void iadst8_c(const tran_low_t *input, tran_low_t *output) {
tran_high_t x7 = input[6];
if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
- output[0] = output[1] = output[2] = output[3] = output[4] = output[5] =
- output[6] = output[7] = 0;
+ memset(output, 0, 8 * sizeof(*output));
return;
}
@@ -359,13 +360,13 @@ void iadst8_c(const tran_low_t *input, tran_low_t *output) {
}
void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+ int i, j;
tran_low_t out[8 * 8] = { 0 };
tran_low_t *outptr = out;
- int i, j;
tran_low_t temp_in[8], temp_out[8];
// First transform rows
- // only first 4 row has non-zero coefs
+ // Only first 4 row has non-zero coefs
for (i = 0; i < 4; ++i) {
idct8_c(input, outptr);
input += 8;
@@ -550,9 +551,9 @@ void idct16_c(const tran_low_t *input, tran_low_t *output) {
void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
int stride) {
+ int i, j;
tran_low_t out[16 * 16];
tran_low_t *outptr = out;
- int i, j;
tran_low_t temp_in[16], temp_out[16];
// First transform rows
@@ -576,7 +577,6 @@ void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
void iadst16_c(const tran_low_t *input, tran_low_t *output) {
tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
tran_high_t s9, s10, s11, s12, s13, s14, s15;
-
tran_high_t x0 = input[15];
tran_high_t x1 = input[0];
tran_high_t x2 = input[13];
@@ -596,9 +596,7 @@ void iadst16_c(const tran_low_t *input, tran_low_t *output) {
if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
x13 | x14 | x15)) {
- output[0] = output[1] = output[2] = output[3] = output[4] = output[5] =
- output[6] = output[7] = output[8] = output[9] = output[10] =
- output[11] = output[12] = output[13] = output[14] = output[15] = 0;
+ memset(output, 0, 16 * sizeof(*output));
return;
}
@@ -746,9 +744,9 @@ void iadst16_c(const tran_low_t *input, tran_low_t *output) {
void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
int stride) {
+ int i, j;
tran_low_t out[16 * 16] = { 0 };
tran_low_t *outptr = out;
- int i, j;
tran_low_t temp_in[16], temp_out[16];
// First transform rows. Since all non-zero dct coefficients are in
@@ -774,6 +772,7 @@ void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
int i, j;
tran_high_t a1;
tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+
out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
a1 = ROUND_POWER_OF_TWO(out, 6);
for (j = 0; j < 16; ++j) {
@@ -1151,9 +1150,9 @@ void idct32_c(const tran_low_t *input, tran_low_t *output) {
void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
int stride) {
+ int i, j;
tran_low_t out[32 * 32];
tran_low_t *outptr = out;
- int i, j;
tran_low_t temp_in[32], temp_out[32];
// Rows
@@ -1188,13 +1187,13 @@ void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest,
int stride) {
+ int i, j;
tran_low_t out[32 * 32] = { 0 };
tran_low_t *outptr = out;
- int i, j;
tran_low_t temp_in[32], temp_out[32];
// Rows
- // only upper-left 16x16 has non-zero coeff
+ // Only upper-left 16x16 has non-zero coeff
for (i = 0; i < 16; ++i) {
idct32_c(input, outptr);
input += 32;
@@ -1214,13 +1213,13 @@ void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest,
void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
int stride) {
+ int i, j;
tran_low_t out[32 * 32] = { 0 };
tran_low_t *outptr = out;
- int i, j;
tran_low_t temp_in[32], temp_out[32];
// Rows
- // only upper-left 8x8 has non-zero coeff
+ // Only upper-left 8x8 has non-zero coeff
for (i = 0; i < 8; ++i) {
idct32_c(input, outptr);
input += 32;
@@ -1241,8 +1240,8 @@ void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
int i, j;
tran_high_t a1;
-
tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+
out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
a1 = ROUND_POWER_OF_TWO(out, 6);
@@ -1373,12 +1372,12 @@ void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) {
// stage 1
temp1 = (input[0] + input[2]) * cospi_16_64;
temp2 = (input[0] - input[2]) * cospi_16_64;
- step[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
- step[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
// stage 2
output[0] = HIGHBD_WRAPLOW(step[0] + step[3], bd);
@@ -1389,9 +1388,9 @@ void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) {
void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
int stride, int bd) {
+ int i, j;
tran_low_t out[4 * 4];
tran_low_t *outptr = out;
- int i, j;
tran_low_t temp_in[4], temp_out[4];
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
@@ -1418,10 +1417,10 @@ void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8,
int i;
tran_high_t a1;
tran_low_t out =
- HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
+ HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd);
+ out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
a1 = ROUND_POWER_OF_TWO(out, 4);
for (i = 0; i < 4; i++) {
@@ -1452,12 +1451,12 @@ void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) {
step1[3] = input[6];
temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
- step1[4] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[7] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
- step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
// stage 2 & stage 3 - even half
vpx_highbd_idct4_c(step1, step1, bd);
@@ -1472,8 +1471,8 @@ void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) {
step1[4] = step2[4];
temp1 = (step2[6] - step2[5]) * cospi_16_64;
temp2 = (step2[5] + step2[6]) * cospi_16_64;
- step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
step1[7] = step2[7];
// stage 4
@@ -1489,20 +1488,20 @@ void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) {
void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
int stride, int bd) {
+ int i, j;
tran_low_t out[8 * 8];
tran_low_t *outptr = out;
- int i, j;
tran_low_t temp_in[8], temp_out[8];
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- // First transform rows.
+ // First transform rows
for (i = 0; i < 8; ++i) {
vpx_highbd_idct8_c(input, outptr, bd);
input += 8;
outptr += 8;
}
- // Then transform columns.
+ // Then transform columns
for (i = 0; i < 8; ++i) {
for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
vpx_highbd_idct8_c(temp_in, temp_out, bd);
@@ -1518,9 +1517,10 @@ void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8,
int i, j;
tran_high_t a1;
tran_low_t out =
- HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
+ HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd);
+
+ out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
a1 = ROUND_POWER_OF_TWO(out, 5);
for (j = 0; j < 8; ++j) {
for (i = 0; i < 8; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
@@ -1567,10 +1567,10 @@ void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) {
// The overall dynamic range is 14b (input) + 14b (multiplication scaling)
// + 1b (addition) = 29b.
// Hence the output bit depth is 15b.
- output[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s3), bd);
- output[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s3), bd);
- output[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd);
- output[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s1 - s3), bd);
+ output[0] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s3), bd);
+ output[1] = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s3), bd);
+ output[2] = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
+ output[3] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s1 - s3), bd);
}
void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
@@ -1608,14 +1608,14 @@ void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
- x0 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s4), bd);
- x1 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s5), bd);
- x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 + s6), bd);
- x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 + s7), bd);
- x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 - s4), bd);
- x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 - s5), bd);
- x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 - s6), bd);
- x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 - s7), bd);
+ x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s4), bd);
+ x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s5), bd);
+ x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s6), bd);
+ x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s7), bd);
+ x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s4), bd);
+ x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s5), bd);
+ x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s6), bd);
+ x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s7), bd);
// stage 2
s0 = x0;
@@ -1631,10 +1631,10 @@ void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
x2 = HIGHBD_WRAPLOW(s0 - s2, bd);
x3 = HIGHBD_WRAPLOW(s1 - s3, bd);
- x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 + s6), bd);
- x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s7), bd);
- x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s6), bd);
- x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 - s7), bd);
+ x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s6), bd);
+ x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd);
+ x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd);
+ x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd);
// stage 3
s2 = cospi_16_64 * (x2 + x3);
@@ -1642,10 +1642,10 @@ void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
s6 = cospi_16_64 * (x6 + x7);
s7 = cospi_16_64 * (x6 - x7);
- x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd);
- x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3), bd);
- x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6), bd);
- x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7), bd);
+ x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
+ x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd);
+ x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd);
+ x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd);
output[0] = HIGHBD_WRAPLOW(x0, bd);
output[1] = HIGHBD_WRAPLOW(-x4, bd);
@@ -1657,22 +1657,23 @@ void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
output[7] = HIGHBD_WRAPLOW(-x1, bd);
}
-void vpx_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest8,
int stride, int bd) {
+ int i, j;
tran_low_t out[8 * 8] = { 0 };
tran_low_t *outptr = out;
- int i, j;
tran_low_t temp_in[8], temp_out[8];
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- // First transform rows.
- // Only first 4 row has non-zero coefs.
+ // First transform rows
+ // Only first 4 row has non-zero coefs
for (i = 0; i < 4; ++i) {
vpx_highbd_idct8_c(input, outptr, bd);
input += 8;
outptr += 8;
}
- // Then transform columns.
+
+ // Then transform columns
for (i = 0; i < 8; ++i) {
for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
vpx_highbd_idct8_c(temp_in, temp_out, bd);
@@ -1726,23 +1727,23 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
- step2[8] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[15] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
- step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
- step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
- step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
// stage 3
step1[0] = step2[0];
@@ -1752,12 +1753,12 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
- step1[4] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[7] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
- step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd);
step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd);
@@ -1771,12 +1772,12 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
// stage 4
temp1 = (step1[0] + step1[1]) * cospi_16_64;
temp2 = (step1[0] - step1[1]) * cospi_16_64;
- step2[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
- step2[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
@@ -1786,12 +1787,12 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
step2[15] = step1[15];
temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
- step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
- step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
step2[11] = step1[11];
step2[12] = step1[12];
@@ -1803,8 +1804,8 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
step1[4] = step2[4];
temp1 = (step2[6] - step2[5]) * cospi_16_64;
temp2 = (step2[5] + step2[6]) * cospi_16_64;
- step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
step1[7] = step2[7];
step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd);
@@ -1829,12 +1830,12 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
step2[9] = step1[9];
temp1 = (-step1[10] + step1[13]) * cospi_16_64;
temp2 = (step1[10] + step1[13]) * cospi_16_64;
- step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
temp1 = (-step1[11] + step1[12]) * cospi_16_64;
temp2 = (step1[11] + step1[12]) * cospi_16_64;
- step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
step2[14] = step1[14];
step2[15] = step1[15];
@@ -1859,20 +1860,20 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
int stride, int bd) {
+ int i, j;
tran_low_t out[16 * 16];
tran_low_t *outptr = out;
- int i, j;
tran_low_t temp_in[16], temp_out[16];
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- // First transform rows.
+ // First transform rows
for (i = 0; i < 16; ++i) {
vpx_highbd_idct16_c(input, outptr, bd);
input += 16;
outptr += 16;
}
- // Then transform columns.
+ // Then transform columns
for (i = 0; i < 16; ++i) {
for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
vpx_highbd_idct16_c(temp_in, temp_out, bd);
@@ -1936,22 +1937,22 @@ void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
- x0 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s8), bd);
- x1 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s9), bd);
- x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 + s10), bd);
- x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 + s11), bd);
- x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 + s12), bd);
- x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s13), bd);
- x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6 + s14), bd);
- x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7 + s15), bd);
- x8 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 - s8), bd);
- x9 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 - s9), bd);
- x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 - s10), bd);
- x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 - s11), bd);
- x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s12), bd);
- x13 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 - s13), bd);
- x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6 - s14), bd);
- x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7 - s15), bd);
+ x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s8), bd);
+ x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s9), bd);
+ x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s10), bd);
+ x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s11), bd);
+ x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s12), bd);
+ x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s13), bd);
+ x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 + s14), bd);
+ x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 + s15), bd);
+ x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s8), bd);
+ x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s9), bd);
+ x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s10), bd);
+ x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s11), bd);
+ x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s12), bd);
+ x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s13), bd);
+ x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 - s14), bd);
+ x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 - s15), bd);
// stage 2
s0 = x0;
@@ -1979,14 +1980,14 @@ void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
x5 = HIGHBD_WRAPLOW(s1 - s5, bd);
x6 = HIGHBD_WRAPLOW(s2 - s6, bd);
x7 = HIGHBD_WRAPLOW(s3 - s7, bd);
- x8 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s8 + s12), bd);
- x9 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s9 + s13), bd);
- x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s10 + s14), bd);
- x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s11 + s15), bd);
- x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s8 - s12), bd);
- x13 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s9 - s13), bd);
- x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s10 - s14), bd);
- x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s11 - s15), bd);
+ x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 + s12), bd);
+ x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 + s13), bd);
+ x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 + s14), bd);
+ x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 + s15), bd);
+ x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 - s12), bd);
+ x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 - s13), bd);
+ x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 - s14), bd);
+ x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 - s15), bd);
// stage 3
s0 = x0;
@@ -2010,18 +2011,18 @@ void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
x2 = HIGHBD_WRAPLOW(s0 - s2, bd);
x3 = HIGHBD_WRAPLOW(s1 - s3, bd);
- x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 + s6), bd);
- x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s7), bd);
- x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s6), bd);
- x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 - s7), bd);
+ x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s6), bd);
+ x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd);
+ x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd);
+ x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd);
x8 = HIGHBD_WRAPLOW(s8 + s10, bd);
x9 = HIGHBD_WRAPLOW(s9 + s11, bd);
x10 = HIGHBD_WRAPLOW(s8 - s10, bd);
x11 = HIGHBD_WRAPLOW(s9 - s11, bd);
- x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s12 + s14), bd);
- x13 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s13 + s15), bd);
- x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s12 - s14), bd);
- x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s13 - s15), bd);
+ x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 + s14), bd);
+ x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 + s15), bd);
+ x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 - s14), bd);
+ x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 - s15), bd);
// stage 4
s2 = (-cospi_16_64) * (x2 + x3);
@@ -2033,14 +2034,14 @@ void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
s14 = (-cospi_16_64) * (x14 + x15);
s15 = cospi_16_64 * (x14 - x15);
- x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd);
- x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3), bd);
- x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6), bd);
- x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7), bd);
- x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s10), bd);
- x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s11), bd);
- x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s14), bd);
- x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s15), bd);
+ x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
+ x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd);
+ x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd);
+ x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd);
+ x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10), bd);
+ x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11), bd);
+ x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s14), bd);
+ x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s15), bd);
output[0] = HIGHBD_WRAPLOW(x0, bd);
output[1] = HIGHBD_WRAPLOW(-x8, bd);
@@ -2062,9 +2063,9 @@ void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,
int stride, int bd) {
+ int i, j;
tran_low_t out[16 * 16] = { 0 };
tran_low_t *outptr = out;
- int i, j;
tran_low_t temp_in[16], temp_out[16];
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
@@ -2076,7 +2077,7 @@ void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,
outptr += 16;
}
- // Then transform columns.
+ // Then transform columns
for (i = 0; i < 16; ++i) {
for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
vpx_highbd_idct16_c(temp_in, temp_out, bd);
@@ -2092,10 +2093,10 @@ void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8,
int i, j;
tran_high_t a1;
tran_low_t out =
- HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
+ HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd);
+ out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
a1 = ROUND_POWER_OF_TWO(out, 6);
for (j = 0; j < 16; ++j) {
for (i = 0; i < 16; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
@@ -2137,43 +2138,43 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
- step1[16] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[31] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step1[16] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[31] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
- step1[17] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[30] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step1[17] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[30] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
- step1[18] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[29] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
- step1[19] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[28] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step1[19] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[28] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
- step1[20] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[27] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
- step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
- step1[22] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[25] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
- step1[23] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[24] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step1[23] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[24] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
// stage 2
step2[0] = step1[0];
@@ -2187,23 +2188,23 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
- step2[8] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[15] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
- step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
- step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
- step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[17], bd);
step2[17] = HIGHBD_WRAPLOW(step1[16] - step1[17], bd);
@@ -2230,12 +2231,12 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
- step1[4] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[7] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
- step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd);
step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd);
@@ -2250,22 +2251,22 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
step1[31] = step2[31];
temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
- step1[17] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[30] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step1[17] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[30] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
- step1[18] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[29] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
step1[19] = step2[19];
step1[20] = step2[20];
temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
- step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
- step1[22] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[25] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
step1[23] = step2[23];
step1[24] = step2[24];
step1[27] = step2[27];
@@ -2274,12 +2275,12 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
// stage 4
temp1 = (step1[0] + step1[1]) * cospi_16_64;
temp2 = (step1[0] - step1[1]) * cospi_16_64;
- step2[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
- step2[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
@@ -2289,12 +2290,12 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
step2[15] = step1[15];
temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
- step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
- step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
step2[11] = step1[11];
step2[12] = step1[12];
@@ -2324,8 +2325,8 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
step1[4] = step2[4];
temp1 = (step2[6] - step2[5]) * cospi_16_64;
temp2 = (step2[5] + step2[6]) * cospi_16_64;
- step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
step1[7] = step2[7];
step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd);
@@ -2341,20 +2342,20 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
step1[17] = step2[17];
temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
- step1[18] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[29] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
- step1[19] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[28] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step1[19] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[28] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
- step1[20] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[27] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
- step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
step1[22] = step2[22];
step1[23] = step2[23];
step1[24] = step2[24];
@@ -2375,12 +2376,12 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
step2[9] = step1[9];
temp1 = (-step1[10] + step1[13]) * cospi_16_64;
temp2 = (step1[10] + step1[13]) * cospi_16_64;
- step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
temp1 = (-step1[11] + step1[12]) * cospi_16_64;
temp2 = (step1[11] + step1[12]) * cospi_16_64;
- step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
step2[14] = step1[14];
step2[15] = step1[15];
@@ -2426,20 +2427,20 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
step1[19] = step2[19];
temp1 = (-step2[20] + step2[27]) * cospi_16_64;
temp2 = (step2[20] + step2[27]) * cospi_16_64;
- step1[20] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[27] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
temp1 = (-step2[21] + step2[26]) * cospi_16_64;
temp2 = (step2[21] + step2[26]) * cospi_16_64;
- step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
temp1 = (-step2[22] + step2[25]) * cospi_16_64;
temp2 = (step2[22] + step2[25]) * cospi_16_64;
- step1[22] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[25] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
temp1 = (-step2[23] + step2[24]) * cospi_16_64;
temp2 = (step2[23] + step2[24]) * cospi_16_64;
- step1[23] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[24] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step1[23] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[24] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
step1[28] = step2[28];
step1[29] = step2[29];
step1[30] = step2[30];
@@ -2482,9 +2483,9 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
int stride, int bd) {
+ int i, j;
tran_low_t out[32 * 32];
tran_low_t *outptr = out;
- int i, j;
tran_low_t temp_in[32], temp_out[32];
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
@@ -2520,19 +2521,20 @@ void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8,
int stride, int bd) {
+ int i, j;
tran_low_t out[32 * 32] = { 0 };
tran_low_t *outptr = out;
- int i, j;
tran_low_t temp_in[32], temp_out[32];
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
// Rows
- // Only upper-left 8x8 has non-zero coeff.
+ // Only upper-left 8x8 has non-zero coeff
for (i = 0; i < 8; ++i) {
highbd_idct32_c(input, outptr, bd);
input += 32;
outptr += 32;
}
+
// Columns
for (i = 0; i < 32; ++i) {
for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
@@ -2549,10 +2551,10 @@ void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8,
int i, j;
int a1;
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
tran_low_t out =
- HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
- out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd);
+ HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
+
+ out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
a1 = ROUND_POWER_OF_TWO(out, 6);
for (j = 0; j < 32; ++j) {
@@ -2560,4 +2562,5 @@ void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8,
dest += stride;
}
}
+
#endif // CONFIG_VP9_HIGHBITDEPTH
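
In the inv_txfm.c hunks above, the zero-input early-outs in iadst4/8/16 now clear the output with a single memset instead of element-by-element stores, and the loop counters are declared ahead of the arrays. A minimal sketch of the early-out pattern (hypothetical helper name; tran_low_t here stands in for the libvpx typedef):

#include <string.h>
#include <stdint.h>

typedef int32_t tran_low_t; /* stand-in for the libvpx type */

static void iadst4_sketch(const tran_low_t *input, tran_low_t *output) {
  const tran_low_t x0 = input[0], x1 = input[1], x2 = input[2], x3 = input[3];

  if (!(x0 | x1 | x2 | x3)) {              /* all-zero block: nothing to do */
    memset(output, 0, 4 * sizeof(*output));
    return;
  }
  /* placeholder for the sinusoidal butterflies of the real iadst4_c */
  output[0] = x0; output[1] = x1; output[2] = x2; output[3] = x3;
}

int main(void) {
  tran_low_t in[4] = { 0, 0, 0, 0 }, out[4] = { 7, 7, 7, 7 };
  iadst4_sketch(in, out);
  return out[0] | out[1] | out[2] | out[3]; /* 0: the early-out cleared it */
}
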
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/inv_txfm.h b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/inv_txfm.h
index e530730d575..13137659fae 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/inv_txfm.h
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/inv_txfm.h
@@ -57,11 +57,6 @@ static INLINE tran_high_t highbd_check_range(tran_high_t input, int bd) {
(void)bd;
return input;
}
-
-static INLINE tran_high_t highbd_dct_const_round_shift(tran_high_t input) {
- tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
- return (tran_high_t)rv;
-}
#endif // CONFIG_VP9_HIGHBITDEPTH
#if CONFIG_EMULATE_HARDWARE
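
The inv_txfm.h hunk above deletes highbd_dct_const_round_shift(), whose entire body was ROUND_POWER_OF_TWO(input, DCT_CONST_BITS), and the .c hunks switch every high-bitdepth call site to dct_const_round_shift(), which presumably performs the same rounding shift. A minimal sketch of that operation (DCT_CONST_BITS = 14 and the macro body are assumptions copied from common libvpx definitions, not from this diff):

#include <stdint.h>
#include <assert.h>

#define DCT_CONST_BITS 14
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))

static int64_t round_shift_sketch(int64_t input) {
  return ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
}

int main(void) {
  /* Products with the cospi_*_64 constants carry an extra 2^14 scale;
     the shift rounds that scale back out. */
  assert(round_shift_sketch(3 << DCT_CONST_BITS) == 3);
  assert(round_shift_sketch((3 << DCT_CONST_BITS) + (1 << 13)) == 4);
  return 0;
}
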
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/loopfilter.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/loopfilter.c
index 60a15e23bcf..9866ea37d6d 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/loopfilter.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/loopfilter.c
@@ -94,8 +94,8 @@ static INLINE void filter4(int8_t mask, uint8_t thresh, uint8_t *op1,
filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask;
// save bottom 3 bits so that we round one side +4 and the other +3
- // if it equals 4 we'll set to adjust by -1 to account for the fact
- // we'd round 3 the other way
+ // if it equals 4 we'll set it to adjust by -1 to account for the fact
+ // we'd round it by 3 the other way
filter1 = signed_char_clamp(filter + 4) >> 3;
filter2 = signed_char_clamp(filter + 3) >> 3;
@@ -425,8 +425,8 @@ static INLINE void highbd_filter4(int8_t mask, uint8_t thresh, uint16_t *op1,
filter = signed_char_clamp_high(filter + 3 * (qs0 - ps0), bd) & mask;
// Save bottom 3 bits so that we round one side +4 and the other +3
- // if it equals 4 we'll set to adjust by -1 to account for the fact
- // we'd round 3 the other way.
+ // if it equals 4 we'll set it to adjust by -1 to account for the fact
+ // we'd round it by 3 the other way.
filter1 = signed_char_clamp_high(filter + 4, bd) >> 3;
filter2 = signed_char_clamp_high(filter + 3, bd) >> 3;
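
The reworded comments above describe the asymmetric rounding in filter4: the same filter value is shifted down by 3 after adding 4 on one side and 3 on the other, so the q0 and p0 adjustments differ by at most one. A standalone scalar illustration (not the libvpx API):

#include <stdio.h>

static int clamp8(int v) { return v < -128 ? -128 : (v > 127 ? 127 : v); }

int main(void) {
  int filter = 20;                        /* example masked filter value */
  int filter1 = clamp8(filter + 4) >> 3;  /* subtracted from q0 */
  int filter2 = clamp8(filter + 3) >> 3;  /* added to p0 */
  printf("filter1=%d filter2=%d\n", filter1, filter2);  /* prints 3 and 2 */
  return 0;
}
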
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/deblock_msa.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/deblock_msa.c
index 402d7ed9979..cc633c6698d 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/deblock_msa.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/deblock_msa.c
@@ -454,7 +454,7 @@ void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch,
v16u8 tmp = { 0 };
v16i8 zero = { 0 };
v8u16 sum_h, src_r_h, src_l_h;
- v4u32 src_r_w, src_l_w;
+ v4u32 src_r_w;
v4i32 flimit_vec;
flimit_vec = __msa_fill_w(flimit);
@@ -473,9 +473,8 @@ void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch,
src[15] = 0;
ILVRL_B2_UH(zero, src, src_r_h, src_l_h);
src_r_w = __msa_dotp_u_w(src_r_h, src_r_h);
- src_l_w = __msa_dotp_u_w(src_l_h, src_l_h);
+ src_r_w += __msa_dotp_u_w(src_l_h, src_l_h);
sum_sq = HADD_SW_S32(src_r_w);
- sum_sq += HADD_SW_S32(src_l_w);
sum_h = __msa_hadd_u_h(src, src);
sum = HADD_UH_U32(sum_h);
{
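
The deblock_msa.c hunk above folds the second dot product into the src_r_w accumulator and drops src_l_w, so only one horizontal add (HADD_SW_S32) is needed; this relies on the horizontal sum distributing over element-wise vector addition. Scalar sketch (hypothetical helper, assuming the partial sums stay in 32-bit range as they do for 8-bit pixels):

#include <assert.h>
#include <stdint.h>

static int32_t hadd4(const int32_t v[4]) { return v[0] + v[1] + v[2] + v[3]; }

int main(void) {
  const int32_t lo[4] = { 1, 4, 9, 16 }, hi[4] = { 25, 36, 49, 64 };
  int32_t acc[4];
  int i;
  for (i = 0; i < 4; ++i) acc[i] = lo[i] + hi[i];  /* src_r_w += dotp(...) */
  assert(hadd4(lo) + hadd4(hi) == hadd4(acc));     /* same sum_sq either way */
  return 0;
}
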
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/inv_txfm_msa.h b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/inv_txfm_msa.h
index 8b8a4173d2f..1fe9b28e8ad 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/inv_txfm_msa.h
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/inv_txfm_msa.h
@@ -196,18 +196,18 @@
out2, out3) \
{ \
v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m; \
- v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ v4i32 tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd; \
\
ILVRL_H2_SH(inp1, inp0, madd_s1_m, madd_s0_m); \
ILVRL_H2_SH(inp3, inp2, madd_s3_m, madd_s2_m); \
DOTP_SH4_SW(madd_s1_m, madd_s0_m, madd_s1_m, madd_s0_m, cst0, cst0, cst1, \
- cst1, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
- SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \
- PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out1); \
+ cst1, tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd); \
+ SRARI_W4_SW(tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd, DCT_CONST_BITS); \
+ PCKEV_H2_SH(tmp1_madd, tmp0_madd, tmp3_madd, tmp2_madd, out0, out1); \
DOTP_SH4_SW(madd_s3_m, madd_s2_m, madd_s3_m, madd_s2_m, cst2, cst2, cst3, \
- cst3, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
- SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \
- PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out2, out3); \
+ cst3, tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd); \
+ SRARI_W4_SW(tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd, DCT_CONST_BITS); \
+ PCKEV_H2_SH(tmp1_madd, tmp0_madd, tmp3_madd, tmp2_madd, out2, out3); \
}
/* idct 8x8 macro */
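
The inv_txfm_msa.h hunk above only renames the macro-local temporaries (tmp0_m ... tmp3_m become tmp0_madd ... tmp3_madd). A plausible reason, though it is not stated in the diff, is to avoid shadowing when this statement macro expands inside another macro that also declares tmp0_m; a minimal illustration of that pattern:

/* The inner macro uses its own temporary name, so it can expand inside
   OUTER without shadowing OUTER's tmp_m. */
#define INNER(x, out)        \
  {                          \
    int tmp_madd = (x) * 2;  \
    (out) = tmp_madd;        \
  }

#define OUTER(x, out)        \
  {                          \
    int tmp_m = (x);         \
    INNER(tmp_m, out);       \
  }

int main(void) {
  int r;
  OUTER(21, r); /* expands both macros; r == 42 */
  return r == 42 ? 0 : 1;
}
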
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/loopfilter_16_msa.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/loopfilter_16_msa.c
index 6ee2456ca5d..b73d56bd558 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/loopfilter_16_msa.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/loopfilter_16_msa.c
@@ -449,7 +449,7 @@ static void mb_lpf_horizontal_edge(uint8_t *src, int32_t pitch,
LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
mask, flat);
VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
- VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
+ VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
q1_out);
flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);
@@ -779,7 +779,7 @@ int32_t vpx_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48,
/* flat4 */
VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
/* filter4 */
- VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+ VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/loopfilter_4_msa.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/loopfilter_4_msa.c
index e0079665f71..9500cd2fd86 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/loopfilter_4_msa.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/loopfilter_4_msa.c
@@ -27,7 +27,7 @@ void vpx_lpf_horizontal_4_msa(uint8_t *src, int32_t pitch,
LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
mask, flat);
- VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+ VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
@@ -86,7 +86,7 @@ void vpx_lpf_vertical_4_msa(uint8_t *src, int32_t pitch,
q3);
LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
mask, flat);
- VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
+ VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
ILVR_B2_SH(p0, p1, q1, q0, vec0, vec1);
ILVRL_H2_SH(vec1, vec0, vec2, vec3);
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/loopfilter_8_msa.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/loopfilter_8_msa.c
index 403e5dc51b3..a22c62bb3a3 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/loopfilter_8_msa.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/loopfilter_8_msa.c
@@ -32,7 +32,7 @@ void vpx_lpf_horizontal_8_msa(uint8_t *src, int32_t pitch,
LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
mask, flat);
VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
- VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+ VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);
@@ -177,7 +177,7 @@ void vpx_lpf_vertical_8_msa(uint8_t *src, int32_t pitch,
/* flat4 */
VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
/* filter4 */
- VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+ VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);
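
The MSA loop-filter call sites above now all use the single full-width VP9_LPF_FILTER4_4W macro; the dropped _8W variant processed only the low half of the vectors. The rewritten macro (in the loopfilter_msa.h hunk below) also builds 3 * (q0 - p0) with three saturating byte adds instead of widening to 16 bits. A scalar sketch of why the two forms agree under saturation (illustration only, not the libvpx code):

#include <assert.h>

static int sat8(int v) { return v < -128 ? -128 : (v > 127 ? 127 : v); }

static int wide(int filt, int d)   { return sat8(filt + 3 * d); }
static int narrow(int filt, int d) {
  return sat8(sat8(sat8(filt + d) + d) + d);  /* three __msa_adds_s_b steps */
}

int main(void) {
  assert(wide(-6, 5)  == narrow(-6, 5));   /* 9 == 9, no saturation */
  assert(wide(10, 45) == narrow(10, 45));  /* both clip to 127 */
  return 0;
}
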
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/loopfilter_msa.h b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/loopfilter_msa.h
index 4063e5e6b87..49fd74c25a4 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/loopfilter_msa.h
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/loopfilter_msa.h
@@ -13,144 +13,71 @@
#include "vpx_dsp/mips/macros_msa.h"
-#define VP9_LPF_FILTER4_8W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \
- p1_out, p0_out, q0_out, q1_out) \
- { \
- v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign; \
- v16i8 filt, filt1, filt2, cnst4b, cnst3b; \
- v8i16 q0_sub_p0_r, filt_r, cnst3h; \
- \
- p1_m = (v16i8)__msa_xori_b(p1_in, 0x80); \
- p0_m = (v16i8)__msa_xori_b(p0_in, 0x80); \
- q0_m = (v16i8)__msa_xori_b(q0_in, 0x80); \
- q1_m = (v16i8)__msa_xori_b(q1_in, 0x80); \
- \
- filt = __msa_subs_s_b(p1_m, q1_m); \
- filt = filt & (v16i8)hev_in; \
- q0_sub_p0 = q0_m - p0_m; \
- filt_sign = __msa_clti_s_b(filt, 0); \
- \
- cnst3h = __msa_ldi_h(3); \
- q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0, q0_sub_p0); \
- q0_sub_p0_r = __msa_dotp_s_h((v16i8)q0_sub_p0_r, (v16i8)cnst3h); \
- filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt); \
- filt_r += q0_sub_p0_r; \
- filt_r = __msa_sat_s_h(filt_r, 7); \
- \
- /* combine left and right part */ \
- filt = __msa_pckev_b((v16i8)filt_r, (v16i8)filt_r); \
- \
- filt = filt & (v16i8)mask_in; \
- cnst4b = __msa_ldi_b(4); \
- filt1 = __msa_adds_s_b(filt, cnst4b); \
- filt1 >>= 3; \
- \
- cnst3b = __msa_ldi_b(3); \
- filt2 = __msa_adds_s_b(filt, cnst3b); \
- filt2 >>= 3; \
- \
- q0_m = __msa_subs_s_b(q0_m, filt1); \
- q0_out = __msa_xori_b((v16u8)q0_m, 0x80); \
- p0_m = __msa_adds_s_b(p0_m, filt2); \
- p0_out = __msa_xori_b((v16u8)p0_m, 0x80); \
- \
- filt = __msa_srari_b(filt1, 1); \
- hev_in = __msa_xori_b((v16u8)hev_in, 0xff); \
- filt = filt & (v16i8)hev_in; \
- \
- q1_m = __msa_subs_s_b(q1_m, filt); \
- q1_out = __msa_xori_b((v16u8)q1_m, 0x80); \
- p1_m = __msa_adds_s_b(p1_m, filt); \
- p1_out = __msa_xori_b((v16u8)p1_m, 0x80); \
- }
-
-#define VP9_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \
- p1_out, p0_out, q0_out, q1_out) \
- { \
- v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign; \
- v16i8 filt, filt1, filt2, cnst4b, cnst3b; \
- v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h; \
- \
- p1_m = (v16i8)__msa_xori_b(p1_in, 0x80); \
- p0_m = (v16i8)__msa_xori_b(p0_in, 0x80); \
- q0_m = (v16i8)__msa_xori_b(q0_in, 0x80); \
- q1_m = (v16i8)__msa_xori_b(q1_in, 0x80); \
- \
- filt = __msa_subs_s_b(p1_m, q1_m); \
- \
- filt = filt & (v16i8)hev_in; \
- \
- q0_sub_p0 = q0_m - p0_m; \
- filt_sign = __msa_clti_s_b(filt, 0); \
- \
- cnst3h = __msa_ldi_h(3); \
- q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0, q0_sub_p0); \
- q0_sub_p0_r = __msa_dotp_s_h((v16i8)q0_sub_p0_r, (v16i8)cnst3h); \
- filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt); \
- filt_r += q0_sub_p0_r; \
- filt_r = __msa_sat_s_h(filt_r, 7); \
- \
- q0_sub_p0_l = (v8i16)__msa_ilvl_b(q0_sub_p0, q0_sub_p0); \
- q0_sub_p0_l = __msa_dotp_s_h((v16i8)q0_sub_p0_l, (v16i8)cnst3h); \
- filt_l = (v8i16)__msa_ilvl_b(filt_sign, filt); \
- filt_l += q0_sub_p0_l; \
- filt_l = __msa_sat_s_h(filt_l, 7); \
- \
- filt = __msa_pckev_b((v16i8)filt_l, (v16i8)filt_r); \
- filt = filt & (v16i8)mask_in; \
- \
- cnst4b = __msa_ldi_b(4); \
- filt1 = __msa_adds_s_b(filt, cnst4b); \
- filt1 >>= 3; \
- \
- cnst3b = __msa_ldi_b(3); \
- filt2 = __msa_adds_s_b(filt, cnst3b); \
- filt2 >>= 3; \
- \
- q0_m = __msa_subs_s_b(q0_m, filt1); \
- q0_out = __msa_xori_b((v16u8)q0_m, 0x80); \
- p0_m = __msa_adds_s_b(p0_m, filt2); \
- p0_out = __msa_xori_b((v16u8)p0_m, 0x80); \
- \
- filt = __msa_srari_b(filt1, 1); \
- hev_in = __msa_xori_b((v16u8)hev_in, 0xff); \
- filt = filt & (v16i8)hev_in; \
- \
- q1_m = __msa_subs_s_b(q1_m, filt); \
- q1_out = __msa_xori_b((v16u8)q1_m, 0x80); \
- p1_m = __msa_adds_s_b(p1_m, filt); \
- p1_out = __msa_xori_b((v16u8)p1_m, 0x80); \
+#define VP9_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask, hev, p1_out, \
+ p0_out, q0_out, q1_out) \
+ { \
+ v16i8 p1_m, p0_m, q0_m, q1_m, filt, q0_sub_p0, t1, t2; \
+ const v16i8 cnst4b = __msa_ldi_b(4); \
+ const v16i8 cnst3b = __msa_ldi_b(3); \
+ \
+ p1_m = (v16i8)__msa_xori_b(p1_in, 0x80); \
+ p0_m = (v16i8)__msa_xori_b(p0_in, 0x80); \
+ q0_m = (v16i8)__msa_xori_b(q0_in, 0x80); \
+ q1_m = (v16i8)__msa_xori_b(q1_in, 0x80); \
+ \
+ filt = __msa_subs_s_b(p1_m, q1_m); \
+ filt &= hev; \
+ q0_sub_p0 = __msa_subs_s_b(q0_m, p0_m); \
+ filt = __msa_adds_s_b(filt, q0_sub_p0); \
+ filt = __msa_adds_s_b(filt, q0_sub_p0); \
+ filt = __msa_adds_s_b(filt, q0_sub_p0); \
+ filt &= mask; \
+ t1 = __msa_adds_s_b(filt, cnst4b); \
+ t1 >>= cnst3b; \
+ t2 = __msa_adds_s_b(filt, cnst3b); \
+ t2 >>= cnst3b; \
+ q0_m = __msa_subs_s_b(q0_m, t1); \
+ q0_out = __msa_xori_b((v16u8)q0_m, 0x80); \
+ p0_m = __msa_adds_s_b(p0_m, t2); \
+ p0_out = __msa_xori_b((v16u8)p0_m, 0x80); \
+ filt = __msa_srari_b(t1, 1); \
+ hev = __msa_xori_b(hev, 0xff); \
+ filt &= hev; \
+ q1_m = __msa_subs_s_b(q1_m, filt); \
+ q1_out = __msa_xori_b((v16u8)q1_m, 0x80); \
+ p1_m = __msa_adds_s_b(p1_m, filt); \
+ p1_out = __msa_xori_b((v16u8)p1_m, 0x80); \
}
-#define VP9_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out) \
- { \
- v16u8 tmp, p2_a_sub_p0, q2_a_sub_q0, p3_a_sub_p0, q3_a_sub_q0; \
- v16u8 zero_in = { 0 }; \
- \
- tmp = __msa_ori_b(zero_in, 1); \
- p2_a_sub_p0 = __msa_asub_u_b(p2_in, p0_in); \
- q2_a_sub_q0 = __msa_asub_u_b(q2_in, q0_in); \
- p3_a_sub_p0 = __msa_asub_u_b(p3_in, p0_in); \
- q3_a_sub_q0 = __msa_asub_u_b(q3_in, q0_in); \
- \
- p2_a_sub_p0 = __msa_max_u_b(p2_a_sub_p0, q2_a_sub_q0); \
- flat_out = __msa_max_u_b(p2_a_sub_p0, flat_out); \
- p3_a_sub_p0 = __msa_max_u_b(p3_a_sub_p0, q3_a_sub_q0); \
- flat_out = __msa_max_u_b(p3_a_sub_p0, flat_out); \
- \
- flat_out = (tmp < (v16u8)flat_out); \
- flat_out = __msa_xori_b(flat_out, 0xff); \
- flat_out = flat_out & (mask); \
+#define VP9_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out) \
+ { \
+ v16u8 tmp_flat4, p2_a_sub_p0, q2_a_sub_q0, p3_a_sub_p0, q3_a_sub_q0; \
+ v16u8 zero_in = { 0 }; \
+ \
+ tmp_flat4 = __msa_ori_b(zero_in, 1); \
+ p2_a_sub_p0 = __msa_asub_u_b(p2_in, p0_in); \
+ q2_a_sub_q0 = __msa_asub_u_b(q2_in, q0_in); \
+ p3_a_sub_p0 = __msa_asub_u_b(p3_in, p0_in); \
+ q3_a_sub_q0 = __msa_asub_u_b(q3_in, q0_in); \
+ \
+ p2_a_sub_p0 = __msa_max_u_b(p2_a_sub_p0, q2_a_sub_q0); \
+ flat_out = __msa_max_u_b(p2_a_sub_p0, flat_out); \
+ p3_a_sub_p0 = __msa_max_u_b(p3_a_sub_p0, q3_a_sub_q0); \
+ flat_out = __msa_max_u_b(p3_a_sub_p0, flat_out); \
+ \
+ flat_out = (tmp_flat4 < (v16u8)flat_out); \
+ flat_out = __msa_xori_b(flat_out, 0xff); \
+ flat_out = flat_out & (mask); \
}
#define VP9_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in, q5_in, \
q6_in, q7_in, flat_in, flat2_out) \
{ \
- v16u8 tmp, zero_in = { 0 }; \
+ v16u8 tmp_flat5, zero_in = { 0 }; \
v16u8 p4_a_sub_p0, q4_a_sub_q0, p5_a_sub_p0, q5_a_sub_q0; \
v16u8 p6_a_sub_p0, q6_a_sub_q0, p7_a_sub_p0, q7_a_sub_q0; \
\
- tmp = __msa_ori_b(zero_in, 1); \
+ tmp_flat5 = __msa_ori_b(zero_in, 1); \
p4_a_sub_p0 = __msa_asub_u_b(p4_in, p0_in); \
q4_a_sub_q0 = __msa_asub_u_b(q4_in, q0_in); \
p5_a_sub_p0 = __msa_asub_u_b(p5_in, p0_in); \
@@ -168,7 +95,7 @@
p7_a_sub_p0 = __msa_max_u_b(p7_a_sub_p0, q7_a_sub_q0); \
flat2_out = __msa_max_u_b(p7_a_sub_p0, flat2_out); \
\
- flat2_out = (tmp < (v16u8)flat2_out); \
+ flat2_out = (tmp_flat5 < (v16u8)flat2_out); \
flat2_out = __msa_xori_b(flat2_out, 0xff); \
flat2_out = flat2_out & flat_in; \
}
@@ -177,38 +104,38 @@
p2_filt8_out, p1_filt8_out, p0_filt8_out, q0_filt8_out, \
q1_filt8_out, q2_filt8_out) \
{ \
- v8u16 tmp0, tmp1, tmp2; \
+ v8u16 tmp_filt8_0, tmp_filt8_1, tmp_filt8_2; \
\
- tmp2 = p2_in + p1_in + p0_in; \
- tmp0 = p3_in << 1; \
+ tmp_filt8_2 = p2_in + p1_in + p0_in; \
+ tmp_filt8_0 = p3_in << 1; \
\
- tmp0 = tmp0 + tmp2 + q0_in; \
- tmp1 = tmp0 + p3_in + p2_in; \
- p2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \
+ tmp_filt8_0 = tmp_filt8_0 + tmp_filt8_2 + q0_in; \
+ tmp_filt8_1 = tmp_filt8_0 + p3_in + p2_in; \
+ p2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \
\
- tmp1 = tmp0 + p1_in + q1_in; \
- p1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \
+ tmp_filt8_1 = tmp_filt8_0 + p1_in + q1_in; \
+ p1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \
\
- tmp1 = q2_in + q1_in + q0_in; \
- tmp2 = tmp2 + tmp1; \
- tmp0 = tmp2 + (p0_in); \
- tmp0 = tmp0 + (p3_in); \
- p0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp0, 3); \
+ tmp_filt8_1 = q2_in + q1_in + q0_in; \
+ tmp_filt8_2 = tmp_filt8_2 + tmp_filt8_1; \
+ tmp_filt8_0 = tmp_filt8_2 + (p0_in); \
+ tmp_filt8_0 = tmp_filt8_0 + (p3_in); \
+ p0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_0, 3); \
\
- tmp0 = q2_in + q3_in; \
- tmp0 = p0_in + tmp1 + tmp0; \
- tmp1 = q3_in + q3_in; \
- tmp1 = tmp1 + tmp0; \
- q2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \
+ tmp_filt8_0 = q2_in + q3_in; \
+ tmp_filt8_0 = p0_in + tmp_filt8_1 + tmp_filt8_0; \
+ tmp_filt8_1 = q3_in + q3_in; \
+ tmp_filt8_1 = tmp_filt8_1 + tmp_filt8_0; \
+ q2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \
\
- tmp0 = tmp2 + q3_in; \
- tmp1 = tmp0 + q0_in; \
- q0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \
+ tmp_filt8_0 = tmp_filt8_2 + q3_in; \
+ tmp_filt8_1 = tmp_filt8_0 + q0_in; \
+ q0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \
\
- tmp1 = tmp0 - p2_in; \
- tmp0 = q1_in + q3_in; \
- tmp1 = tmp0 + tmp1; \
- q1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \
+ tmp_filt8_1 = tmp_filt8_0 - p2_in; \
+ tmp_filt8_0 = q1_in + q3_in; \
+ tmp_filt8_1 = tmp_filt8_0 + tmp_filt8_1; \
+ q1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \
}
#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/macros_msa.h b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/macros_msa.h
index f498fbe9de2..002e574aa8f 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/macros_msa.h
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/macros_msa.h
@@ -168,20 +168,20 @@
val_m; \
})
#else // !(__mips == 64)
-#define LD(psrc) \
- ({ \
- const uint8_t *psrc_m1 = (const uint8_t *)(psrc); \
- uint32_t val0_m, val1_m; \
- uint64_t val_m = 0; \
- \
- val0_m = LW(psrc_m1); \
- val1_m = LW(psrc_m1 + 4); \
- \
- val_m = (uint64_t)(val1_m); \
- val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \
- val_m = (uint64_t)(val_m | (uint64_t)val0_m); \
- \
- val_m; \
+#define LD(psrc) \
+ ({ \
+ const uint8_t *psrc_m1 = (const uint8_t *)(psrc); \
+ uint32_t val0_m, val1_m; \
+ uint64_t val_m_combined = 0; \
+ \
+ val0_m = LW(psrc_m1); \
+ val1_m = LW(psrc_m1 + 4); \
+ \
+ val_m_combined = (uint64_t)(val1_m); \
+ val_m_combined = (uint64_t)((val_m_combined << 32) & 0xFFFFFFFF00000000); \
+ val_m_combined = (uint64_t)(val_m_combined | (uint64_t)val0_m); \
+ \
+ val_m_combined; \
})
#endif // (__mips == 64)
@@ -909,27 +909,42 @@
sum_m; \
})
-/* Description : Horizontal addition of 8 unsigned halfword elements
- Arguments : Inputs - in (unsigned halfword vector)
- Outputs - sum_m (u32 sum)
- Return Type - unsigned word
- Details : 8 unsigned halfword elements of input vector are added
- together and the resulting integer sum is returned
+/* Description : Horizontal addition of 4 unsigned word elements
+ Arguments : Input - in (unsigned word vector)
+ Output - sum_m (u32 sum)
+ Return Type - unsigned word (GP)
+ Details : 4 unsigned word elements of 'in' vector are added together and
+ the resulting integer sum is returned
*/
-#define HADD_UH_U32(in) \
+#define HADD_UW_U32(in) \
({ \
- v4u32 res_m; \
v2u64 res0_m, res1_m; \
uint32_t sum_m; \
\
- res_m = __msa_hadd_u_w((v8u16)in, (v8u16)in); \
- res0_m = __msa_hadd_u_d(res_m, res_m); \
+ res0_m = __msa_hadd_u_d((v4u32)in, (v4u32)in); \
res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1); \
- res0_m = res0_m + res1_m; \
+ res0_m += res1_m; \
sum_m = __msa_copy_u_w((v4i32)res0_m, 0); \
sum_m; \
})
+/* Description : Horizontal addition of 8 unsigned halfword elements
+ Arguments : Input - in (unsigned halfword vector)
+ Output - sum_m (u32 sum)
+ Return Type - unsigned word
+ Details : 8 unsigned halfword elements of 'in' vector are added
+ together and the resulting integer sum is returned
+*/
+#define HADD_UH_U32(in) \
+ ({ \
+ v4u32 res_m; \
+ uint32_t sum_m; \
+ \
+ res_m = __msa_hadd_u_w((v8u16)in, (v8u16)in); \
+ sum_m = HADD_UW_U32(res_m); \
+ sum_m; \
+ })
+
/* Description : Horizontal addition of unsigned byte vector elements
Arguments : Inputs - in0, in1
Outputs - out0, out1
@@ -2019,13 +2034,12 @@
pdst, stride) \
{ \
v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
- uint8_t *pdst_m = (uint8_t *)(pdst); \
\
tmp0_m = PCKEV_XORI128_UB(in0, in1); \
tmp1_m = PCKEV_XORI128_UB(in2, in3); \
ILVR_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \
AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \
- ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \
+ ST8x4_UB(tmp0_m, tmp1_m, pdst, stride); \
}
/* Description : Pack even byte elements and store byte vector in destination
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/sad_msa.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/sad_msa.c
index 6455814e1b8..e295123acf0 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/sad_msa.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/sad_msa.c
@@ -1030,6 +1030,7 @@ static void sad_64width_x4d_msa(const uint8_t *src, int32_t src_stride,
v8u16 sad2_1 = { 0 };
v8u16 sad3_0 = { 0 };
v8u16 sad3_1 = { 0 };
+ v4u32 sad;
ref0_ptr = aref_ptr[0];
ref1_ptr = aref_ptr[1];
@@ -1061,14 +1062,21 @@ static void sad_64width_x4d_msa(const uint8_t *src, int32_t src_stride,
sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
}
- sad_array[0] = HADD_UH_U32(sad0_0);
- sad_array[0] += HADD_UH_U32(sad0_1);
- sad_array[1] = HADD_UH_U32(sad1_0);
- sad_array[1] += HADD_UH_U32(sad1_1);
- sad_array[2] = HADD_UH_U32(sad2_0);
- sad_array[2] += HADD_UH_U32(sad2_1);
- sad_array[3] = HADD_UH_U32(sad3_0);
- sad_array[3] += HADD_UH_U32(sad3_1);
+ sad = __msa_hadd_u_w(sad0_0, sad0_0);
+ sad += __msa_hadd_u_w(sad0_1, sad0_1);
+ sad_array[0] = HADD_UW_U32(sad);
+
+ sad = __msa_hadd_u_w(sad1_0, sad1_0);
+ sad += __msa_hadd_u_w(sad1_1, sad1_1);
+ sad_array[1] = HADD_UW_U32(sad);
+
+ sad = __msa_hadd_u_w(sad2_0, sad2_0);
+ sad += __msa_hadd_u_w(sad2_1, sad2_1);
+ sad_array[2] = HADD_UW_U32(sad);
+
+ sad = __msa_hadd_u_w(sad3_0, sad3_0);
+ sad += __msa_hadd_u_w(sad3_1, sad3_1);
+ sad_array[3] = HADD_UW_U32(sad);
}
static uint32_t avgsad_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/variance_msa.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/variance_msa.c
index 085990e4845..49b2f99230f 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/variance_msa.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/variance_msa.c
@@ -489,27 +489,19 @@ static uint32_t sse_64width_msa(const uint8_t *src_ptr, int32_t src_stride,
uint32_t vpx_get4x4sse_cs_msa(const uint8_t *src_ptr, int32_t src_stride,
const uint8_t *ref_ptr, int32_t ref_stride) {
- uint32_t err = 0;
uint32_t src0, src1, src2, src3;
uint32_t ref0, ref1, ref2, ref3;
v16i8 src = { 0 };
v16i8 ref = { 0 };
- v16u8 src_vec0, src_vec1;
- v8i16 diff0, diff1;
v4i32 err0 = { 0 };
- v4i32 err1 = { 0 };
LW4(src_ptr, src_stride, src0, src1, src2, src3);
LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
INSERT_W4_SB(src0, src1, src2, src3, src);
INSERT_W4_SB(ref0, ref1, ref2, ref3, ref);
- ILVRL_B2_UB(src, ref, src_vec0, src_vec1);
- HSUB_UB2_SH(src_vec0, src_vec1, diff0, diff1);
- DPADD_SH2_SW(diff0, diff1, diff0, diff1, err0, err1);
- err = HADD_SW_S32(err0);
- err += HADD_SW_S32(err1);
+ CALC_MSE_B(src, ref, err0);
- return err;
+ return HADD_SW_S32(err0);
}
#define VARIANCE_4Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 4);
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/vpx_convolve_msa.h b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/vpx_convolve_msa.h
index 198c21ed20a..f75679521a4 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/vpx_convolve_msa.h
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/vpx_convolve_msa.h
@@ -16,18 +16,18 @@
extern const uint8_t mc_filt_mask_arr[16 * 3];
-#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt0, filt1, filt2, \
- filt3) \
- ({ \
- v8i16 tmp0, tmp1; \
- \
- tmp0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)filt0); \
- tmp0 = __msa_dpadd_s_h(tmp0, (v16i8)vec1, (v16i8)filt1); \
- tmp1 = __msa_dotp_s_h((v16i8)vec2, (v16i8)filt2); \
- tmp1 = __msa_dpadd_s_h(tmp1, (v16i8)vec3, (v16i8)filt3); \
- tmp0 = __msa_adds_s_h(tmp0, tmp1); \
- \
- tmp0; \
+#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt0, filt1, filt2, \
+ filt3) \
+ ({ \
+ v8i16 tmp_dpadd_0, tmp_dpadd_1; \
+ \
+ tmp_dpadd_0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)filt0); \
+ tmp_dpadd_0 = __msa_dpadd_s_h(tmp_dpadd_0, (v16i8)vec1, (v16i8)filt1); \
+ tmp_dpadd_1 = __msa_dotp_s_h((v16i8)vec2, (v16i8)filt2); \
+ tmp_dpadd_1 = __msa_dpadd_s_h(tmp_dpadd_1, (v16i8)vec3, (v16i8)filt3); \
+ tmp_dpadd_0 = __msa_adds_s_h(tmp_dpadd_0, tmp_dpadd_1); \
+ \
+ tmp_dpadd_0; \
})
#define HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_h0, \
@@ -114,11 +114,10 @@ extern const uint8_t mc_filt_mask_arr[16 * 3];
stride) \
{ \
v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
- uint8_t *pdst_m = (uint8_t *)(pdst); \
\
PCKEV_B2_UB(in2, in1, in4, in3, tmp0_m, tmp1_m); \
PCKEV_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \
AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \
- ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \
+ ST8x4_UB(tmp0_m, tmp1_m, pdst, stride); \
}
#endif /* VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_ */
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_convolve.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_convolve.c
index f17281b3698..cab6368e606 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_convolve.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_convolve.c
@@ -25,6 +25,7 @@ static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride,
int x_step_q4, int w, int h) {
int x, y;
src -= SUBPEL_TAPS / 2 - 1;
+
for (y = 0; y < h; ++y) {
int x_q4 = x0_q4;
for (x = 0; x < w; ++x) {
@@ -46,6 +47,7 @@ static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride,
int x_step_q4, int w, int h) {
int x, y;
src -= SUBPEL_TAPS / 2 - 1;
+
for (y = 0; y < h; ++y) {
int x_q4 = x0_q4;
for (x = 0; x < w; ++x) {
@@ -72,7 +74,7 @@ static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride,
for (x = 0; x < w; ++x) {
int y_q4 = y0_q4;
for (y = 0; y < h; ++y) {
- const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+ const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
int k, sum = 0;
for (k = 0; k < SUBPEL_TAPS; ++k)
@@ -95,7 +97,7 @@ static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride,
for (x = 0; x < w; ++x) {
int y_q4 = y0_q4;
for (y = 0; y < h; ++y) {
- const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+ const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
int k, sum = 0;
for (k = 0; k < SUBPEL_TAPS; ++k)
@@ -128,8 +130,8 @@ static void convolve(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
// --Must round-up because block may be located at sub-pixel position.
// --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
// --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
- uint8_t temp[135 * 64];
- int intermediate_height =
+ uint8_t temp[64 * 135];
+ const int intermediate_height =
(((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
assert(w <= 64);
@@ -143,16 +145,6 @@ static void convolve(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
y_filters, y0_q4, y_step_q4, w, h);
}
-static const InterpKernel *get_filter_base(const int16_t *filter) {
- // NOTE: This assumes that the filter table is 256-byte aligned.
- // TODO(agrange) Modify to make independent of table alignment.
- return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF));
-}
-
-static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
- return (int)((const InterpKernel *)(intptr_t)f - base);
-}
-
void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
@@ -219,7 +211,6 @@ void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
int w, int h) {
const InterpKernel *const filters_x = get_filter_base(filter_x);
const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
const InterpKernel *const filters_y = get_filter_base(filter_y);
const int y0_q4 = get_filter_offset(filter_y, filters_y);
@@ -231,7 +222,7 @@ void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
ptrdiff_t dst_stride, const int16_t *filter_x,
int x_step_q4, const int16_t *filter_y, int y_step_q4,
int w, int h) {
- /* Fixed size intermediate buffer places limits on parameters. */
+ // Fixed size intermediate buffer places limits on parameters.
DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]);
assert(w <= 64);
assert(h <= 64);
@@ -272,7 +263,6 @@ void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
for (y = 0; y < h; ++y) {
for (x = 0; x < w; ++x) dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1);
-
src += src_stride;
dst += dst_stride;
}
@@ -334,9 +324,10 @@ static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride,
const InterpKernel *x_filters, int x0_q4,
int x_step_q4, int w, int h, int bd) {
int x, y;
- uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
src -= SUBPEL_TAPS / 2 - 1;
+
for (y = 0; y < h; ++y) {
int x_q4 = x0_q4;
for (x = 0; x < w; ++x) {
@@ -357,9 +348,10 @@ static void highbd_convolve_avg_horiz(const uint8_t *src8, ptrdiff_t src_stride,
const InterpKernel *x_filters, int x0_q4,
int x_step_q4, int w, int h, int bd) {
int x, y;
- uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
src -= SUBPEL_TAPS / 2 - 1;
+
for (y = 0; y < h; ++y) {
int x_q4 = x0_q4;
for (x = 0; x < w; ++x) {
@@ -382,9 +374,10 @@ static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride,
const InterpKernel *y_filters, int y0_q4,
int y_step_q4, int w, int h, int bd) {
int x, y;
- uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+
for (x = 0; x < w; ++x) {
int y_q4 = y0_q4;
for (y = 0; y < h; ++y) {
@@ -407,9 +400,10 @@ static void highbd_convolve_avg_vert(const uint8_t *src8, ptrdiff_t src_stride,
const InterpKernel *y_filters, int y0_q4,
int y_step_q4, int w, int h, int bd) {
int x, y;
- uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+
for (x = 0; x < w; ++x) {
int y_q4 = y0_q4;
for (y = 0; y < h; ++y) {
@@ -447,7 +441,7 @@ static void highbd_convolve(const uint8_t *src, ptrdiff_t src_stride,
// --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
// --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
uint16_t temp[64 * 135];
- int intermediate_height =
+ const int intermediate_height =
(((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
assert(w <= 64);
@@ -470,6 +464,7 @@ void vpx_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
int h, int bd) {
const InterpKernel *const filters_x = get_filter_base(filter_x);
const int x0_q4 = get_filter_offset(filter_x, filters_x);
+
(void)filter_y;
(void)y_step_q4;
@@ -484,6 +479,7 @@ void vpx_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
int w, int h, int bd) {
const InterpKernel *const filters_x = get_filter_base(filter_x);
const int x0_q4 = get_filter_offset(filter_x, filters_x);
+
(void)filter_y;
(void)y_step_q4;
@@ -498,6 +494,7 @@ void vpx_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
int h, int bd) {
const InterpKernel *const filters_y = get_filter_base(filter_y);
const int y0_q4 = get_filter_offset(filter_y, filters_y);
+
(void)filter_x;
(void)x_step_q4;
@@ -512,6 +509,7 @@ void vpx_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
int w, int h, int bd) {
const InterpKernel *const filters_y = get_filter_base(filter_y);
const int y0_q4 = get_filter_offset(filter_y, filters_y);
+
(void)filter_x;
(void)x_step_q4;
@@ -526,7 +524,6 @@ void vpx_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
int h, int bd) {
const InterpKernel *const filters_x = get_filter_base(filter_x);
const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
const InterpKernel *const filters_y = get_filter_base(filter_y);
const int y0_q4 = get_filter_offset(filter_y, filters_y);
@@ -556,11 +553,12 @@ void vpx_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride,
const int16_t *filter_y, int filter_y_stride,
int w, int h, int bd) {
int r;
- uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+
(void)filter_x;
- (void)filter_y;
(void)filter_x_stride;
+ (void)filter_y;
(void)filter_y_stride;
(void)bd;
@@ -577,18 +575,17 @@ void vpx_highbd_convolve_avg_c(const uint8_t *src8, ptrdiff_t src_stride,
const int16_t *filter_y, int filter_y_stride,
int w, int h, int bd) {
int x, y;
- uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+
(void)filter_x;
- (void)filter_y;
(void)filter_x_stride;
+ (void)filter_y;
(void)filter_y_stride;
(void)bd;
for (y = 0; y < h; ++y) {
- for (x = 0; x < w; ++x) {
- dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1);
- }
+ for (x = 0; x < w; ++x) dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1);
src += src_stride;
dst += dst_stride;
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp.mk b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp.mk
index 66062b6e78e..2909beb0f6c 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp.mk
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp.mk
@@ -86,6 +86,10 @@ DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_8t_intrin_ssse3.c
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/vpx_high_subpixel_8t_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/vpx_high_subpixel_bilinear_sse2.asm
+DSP_SRCS-$(HAVE_NEON) += arm/highbd_vpx_convolve_copy_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/highbd_vpx_convolve_avg_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/highbd_vpx_convolve8_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/highbd_vpx_convolve_neon.c
endif
DSP_SRCS-$(HAVE_SSE2) += x86/vpx_convolve_copy_sse2.asm
@@ -159,6 +163,7 @@ DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_mb_horiz_dspr2.c
DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_mb_vert_dspr2.c
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+DSP_SRCS-$(HAVE_NEON) += arm/highbd_loopfilter_neon.c
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_loopfilter_sse2.c
endif # CONFIG_VP9_HIGHBITDEPTH
@@ -199,27 +204,15 @@ DSP_SRCS-$(HAVE_NEON_ASM) += arm/save_reg_neon$(ASM)
ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
ifeq ($(HAVE_NEON_ASM),yes)
-DSP_SRCS-yes += arm/idct4x4_1_add_neon$(ASM)
-DSP_SRCS-yes += arm/idct4x4_add_neon$(ASM)
-DSP_SRCS-yes += arm/idct8x8_1_add_neon$(ASM)
-DSP_SRCS-yes += arm/idct8x8_add_neon$(ASM)
-DSP_SRCS-yes += arm/idct16x16_1_add_neon$(ASM)
DSP_SRCS-yes += arm/idct16x16_add_neon$(ASM)
-DSP_SRCS-yes += arm/idct32x32_1_add_neon$(ASM)
-DSP_SRCS-yes += arm/idct32x32_add_neon$(ASM)
else
ifeq ($(HAVE_NEON),yes)
-DSP_SRCS-yes += arm/idct4x4_1_add_neon.c
-DSP_SRCS-yes += arm/idct4x4_add_neon.c
-DSP_SRCS-yes += arm/idct8x8_1_add_neon.c
-DSP_SRCS-yes += arm/idct8x8_add_neon.c
-DSP_SRCS-yes += arm/idct16x16_1_add_neon.c
DSP_SRCS-yes += arm/idct16x16_add_neon.c
-DSP_SRCS-yes += arm/idct32x32_1_add_neon.c
-DSP_SRCS-yes += arm/idct32x32_add_neon.c
endif # HAVE_NEON
endif # HAVE_NEON_ASM
DSP_SRCS-$(HAVE_NEON) += arm/idct16x16_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_add_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_34_add_neon.c
DSP_SRCS-$(HAVE_MSA) += mips/inv_txfm_msa.h
DSP_SRCS-$(HAVE_MSA) += mips/idct4x4_msa.c
@@ -233,7 +226,25 @@ DSP_SRCS-$(HAVE_DSPR2) += mips/itrans8_dspr2.c
DSP_SRCS-$(HAVE_DSPR2) += mips/itrans16_dspr2.c
DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_dspr2.c
DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_cols_dspr2.c
-endif # CONFIG_VP9_HIGHBITDEPTH
+endif # !CONFIG_VP9_HIGHBITDEPTH
+
+ifeq ($(HAVE_NEON_ASM),yes)
+DSP_SRCS-yes += arm/idct_neon$(ASM)
+DSP_SRCS-yes += arm/idct4x4_1_add_neon$(ASM)
+DSP_SRCS-yes += arm/idct4x4_add_neon$(ASM)
+DSP_SRCS-yes += arm/idct8x8_1_add_neon$(ASM)
+DSP_SRCS-yes += arm/idct8x8_add_neon$(ASM)
+DSP_SRCS-yes += arm/idct16x16_1_add_neon$(ASM)
+else
+DSP_SRCS-$(HAVE_NEON) += arm/idct4x4_1_add_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/idct4x4_add_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/idct8x8_1_add_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/idct8x8_add_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/idct16x16_1_add_neon.c
+endif # HAVE_NEON_ASM
+DSP_SRCS-$(HAVE_NEON) += arm/idct_neon.h
+DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_1_add_neon.c
+
endif # CONFIG_VP9
# quantization
@@ -241,6 +252,7 @@ ifeq ($(CONFIG_VP9_ENCODER),yes)
DSP_SRCS-yes += quantize.c
DSP_SRCS-yes += quantize.h
+DSP_SRCS-$(HAVE_SSE2) += x86/fdct.h
DSP_SRCS-$(HAVE_SSE2) += x86/quantize_sse2.c
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_quantize_intrin_sse2.c
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl
index d148642e37b..ee403be3975 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -392,28 +392,28 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
# Sub Pixel Filters
#
add_proto qw/void vpx_highbd_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
- specialize qw/vpx_highbd_convolve_copy sse2/;
+ specialize qw/vpx_highbd_convolve_copy sse2 neon/;
add_proto qw/void vpx_highbd_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
- specialize qw/vpx_highbd_convolve_avg sse2/;
+ specialize qw/vpx_highbd_convolve_avg sse2 neon/;
add_proto qw/void vpx_highbd_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
- specialize qw/vpx_highbd_convolve8/, "$sse2_x86_64";
+ specialize qw/vpx_highbd_convolve8 neon/, "$sse2_x86_64";
add_proto qw/void vpx_highbd_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
- specialize qw/vpx_highbd_convolve8_horiz/, "$sse2_x86_64";
+ specialize qw/vpx_highbd_convolve8_horiz neon/, "$sse2_x86_64";
add_proto qw/void vpx_highbd_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
- specialize qw/vpx_highbd_convolve8_vert/, "$sse2_x86_64";
+ specialize qw/vpx_highbd_convolve8_vert neon/, "$sse2_x86_64";
add_proto qw/void vpx_highbd_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
- specialize qw/vpx_highbd_convolve8_avg/, "$sse2_x86_64";
+ specialize qw/vpx_highbd_convolve8_avg neon/, "$sse2_x86_64";
add_proto qw/void vpx_highbd_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
- specialize qw/vpx_highbd_convolve8_avg_horiz/, "$sse2_x86_64";
+ specialize qw/vpx_highbd_convolve8_avg_horiz neon/, "$sse2_x86_64";
add_proto qw/void vpx_highbd_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
- specialize qw/vpx_highbd_convolve8_avg_vert/, "$sse2_x86_64";
+ specialize qw/vpx_highbd_convolve8_avg_vert neon/, "$sse2_x86_64";
} # CONFIG_VP9_HIGHBITDEPTH
#
@@ -457,40 +457,40 @@ specialize qw/vpx_lpf_horizontal_4_dual sse2 neon dspr2 msa/;
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_highbd_lpf_vertical_16/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
- specialize qw/vpx_highbd_lpf_vertical_16 sse2/;
+ specialize qw/vpx_highbd_lpf_vertical_16 sse2 neon/;
add_proto qw/void vpx_highbd_lpf_vertical_16_dual/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
- specialize qw/vpx_highbd_lpf_vertical_16_dual sse2/;
+ specialize qw/vpx_highbd_lpf_vertical_16_dual sse2 neon/;
add_proto qw/void vpx_highbd_lpf_vertical_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
- specialize qw/vpx_highbd_lpf_vertical_8 sse2/;
+ specialize qw/vpx_highbd_lpf_vertical_8 sse2 neon/;
add_proto qw/void vpx_highbd_lpf_vertical_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
- specialize qw/vpx_highbd_lpf_vertical_8_dual sse2/;
+ specialize qw/vpx_highbd_lpf_vertical_8_dual sse2 neon/;
add_proto qw/void vpx_highbd_lpf_vertical_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
- specialize qw/vpx_highbd_lpf_vertical_4 sse2/;
+ specialize qw/vpx_highbd_lpf_vertical_4 sse2 neon/;
add_proto qw/void vpx_highbd_lpf_vertical_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
- specialize qw/vpx_highbd_lpf_vertical_4_dual sse2/;
+ specialize qw/vpx_highbd_lpf_vertical_4_dual sse2 neon/;
add_proto qw/void vpx_highbd_lpf_horizontal_16/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
- specialize qw/vpx_highbd_lpf_horizontal_16 sse2/;
+ specialize qw/vpx_highbd_lpf_horizontal_16 sse2 neon/;
add_proto qw/void vpx_highbd_lpf_horizontal_16_dual/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
- specialize qw/vpx_highbd_lpf_horizontal_16_dual sse2/;
+ specialize qw/vpx_highbd_lpf_horizontal_16_dual sse2 neon/;
add_proto qw/void vpx_highbd_lpf_horizontal_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
- specialize qw/vpx_highbd_lpf_horizontal_8 sse2/;
+ specialize qw/vpx_highbd_lpf_horizontal_8 sse2 neon/;
add_proto qw/void vpx_highbd_lpf_horizontal_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
- specialize qw/vpx_highbd_lpf_horizontal_8_dual sse2/;
+ specialize qw/vpx_highbd_lpf_horizontal_8_dual sse2 neon/;
add_proto qw/void vpx_highbd_lpf_horizontal_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
- specialize qw/vpx_highbd_lpf_horizontal_4 sse2/;
+ specialize qw/vpx_highbd_lpf_horizontal_4 sse2 neon/;
add_proto qw/void vpx_highbd_lpf_horizontal_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
- specialize qw/vpx_highbd_lpf_horizontal_4_dual sse2/;
+ specialize qw/vpx_highbd_lpf_horizontal_4_dual sse2 neon/;
} # CONFIG_VP9_HIGHBITDEPTH
#
@@ -637,26 +637,26 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- add_proto qw/void vpx_highbd_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ add_proto qw/void vpx_highbd_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
add_proto qw/void vpx_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
add_proto qw/void vpx_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
} else {
add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/vpx_idct4x4_16_add sse2/;
+ specialize qw/vpx_idct4x4_16_add neon sse2/;
add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/vpx_idct4x4_1_add sse2/;
+ specialize qw/vpx_idct4x4_1_add neon sse2/;
add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/vpx_idct8x8_64_add sse2/, "$ssse3_x86_64";
+ specialize qw/vpx_idct8x8_64_add neon sse2/, "$ssse3_x86_64";
add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/vpx_idct8x8_12_add sse2/, "$ssse3_x86_64";
+ specialize qw/vpx_idct8x8_12_add neon sse2/, "$ssse3_x86_64";
add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/vpx_idct8x8_1_add sse2/;
+ specialize qw/vpx_idct8x8_1_add neon sse2/;
add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct16x16_256_add sse2/;
@@ -665,7 +665,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vpx_idct16x16_10_add sse2/;
add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/vpx_idct16x16_1_add sse2/;
+ specialize qw/vpx_idct16x16_1_add neon sse2/;
add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct32x32_1024_add sse2/, "$ssse3_x86_64";
@@ -679,7 +679,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vpx_idct32x32_34_add sse2/, "$ssse3_x86_64";
add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/vpx_idct32x32_1_add sse2/;
+ specialize qw/vpx_idct32x32_1_add neon sse2/;
add_proto qw/void vpx_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vpx_highbd_idct4x4_16_add sse2/;
@@ -687,8 +687,8 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vpx_highbd_idct8x8_64_add sse2/;
- add_proto qw/void vpx_highbd_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- specialize qw/vpx_highbd_idct8x8_10_add sse2/;
+ add_proto qw/void vpx_highbd_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ specialize qw/vpx_highbd_idct8x8_12_add sse2/;
add_proto qw/void vpx_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vpx_highbd_idct16x16_256_add sse2/;
@@ -764,8 +764,6 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct32x32_34_add sse2 neon dspr2 msa/, "$ssse3_x86_64";
- # Need to add 34 eob idct32x32 neon implementation.
- $vpx_idct32x32_34_add_neon=vpx_idct32x32_1024_add_neon;
add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct32x32_1_add sse2 neon dspr2 msa/;
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_filter.h b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_filter.h
index 6cea251bcca..26d690501b6 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_filter.h
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_filter.h
@@ -26,6 +26,17 @@ extern "C" {
typedef int16_t InterpKernel[SUBPEL_TAPS];
+static INLINE const InterpKernel *get_filter_base(const int16_t *filter) {
+ // NOTE: This assumes that the filter table is 256-byte aligned.
+ // TODO(agrange) Modify to make independent of table alignment.
+ return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF));
+}
+
+static INLINE int get_filter_offset(const int16_t *f,
+ const InterpKernel *base) {
+ return (int)((const InterpKernel *)(intptr_t)f - base);
+}
+
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/fdct.h b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/fdct.h
new file mode 100644
index 00000000000..54a6d81fcbc
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/fdct.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef VPX_DSP_X86_FDCT_H_
+#define VPX_DSP_X86_FDCT_H_
+
+#include <xmmintrin.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+// Load 8 16 bit values. If the source is 32 bits then cast down.
+// This does not saturate values. It only truncates.
+static INLINE __m128i load_tran_low(const tran_low_t *a) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ return _mm_setr_epi16((int16_t)a[0], (int16_t)a[1], (int16_t)a[2],
+ (int16_t)a[3], (int16_t)a[4], (int16_t)a[5],
+ (int16_t)a[6], (int16_t)a[7]);
+#else
+ return _mm_load_si128((const __m128i *)a);
+#endif
+}
+
+// Store 8 16 bit values. If the destination is 32 bits then sign extend the
+// values by multiplying by 1.
+static INLINE void store_tran_low(__m128i a, tran_low_t *b) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i a_hi = _mm_mulhi_epi16(a, one);
+ const __m128i a_lo = _mm_mullo_epi16(a, one);
+ const __m128i a_1 = _mm_unpacklo_epi16(a_lo, a_hi);
+ const __m128i a_2 = _mm_unpackhi_epi16(a_lo, a_hi);
+ _mm_store_si128((__m128i *)(b), a_1);
+ _mm_store_si128((__m128i *)(b + 4), a_2);
+#else
+ _mm_store_si128((__m128i *)(b), a);
+#endif
+}
+
+// Zero fill 8 positions in the output buffer.
+static INLINE void store_zero_tran_low(tran_low_t *a) {
+ const __m128i zero = _mm_setzero_si128();
+#if CONFIG_VP9_HIGHBITDEPTH
+ _mm_store_si128((__m128i *)(a), zero);
+ _mm_store_si128((__m128i *)(a + 4), zero);
+#else
+ _mm_store_si128((__m128i *)(a), zero);
+#endif
+}
+#endif // VPX_DSP_X86_FDCT_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c
index 64b56223ede..2362476c1f1 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c
@@ -77,10 +77,10 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
for (j = 0; j < 4; j++) {
if (test & (1 << (4 * j))) {
int k = 4 * i + j;
- const int64_t tmp1 = abs_coeff[j] + round_ptr[k != 0];
- const int64_t tmp2 = ((tmp1 * quant_ptr[k != 0]) >> 16) + tmp1;
+ const int64_t tmp3 = abs_coeff[j] + round_ptr[k != 0];
+ const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3;
const uint32_t abs_qcoeff =
- (uint32_t)((tmp2 * quant_shift_ptr[k != 0]) >> 16);
+ (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16);
qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j];
dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0];
if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i;
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_sse2.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_sse2.c
index cb56ad0789c..d5fc1440c41 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_sse2.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_sse2.c
@@ -2379,7 +2379,6 @@ void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
#define IDCT32_34 \
/* Stage1 */ \
{ \
- const __m128i zero = _mm_setzero_si128(); \
const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \
const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \
\
@@ -2404,7 +2403,6 @@ void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
\
/* Stage2 */ \
{ \
- const __m128i zero = _mm_setzero_si128(); \
const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \
const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \
\
@@ -2431,7 +2429,6 @@ void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
\
/* Stage3 */ \
{ \
- const __m128i zero = _mm_setzero_si128(); \
const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero); \
const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \
\
@@ -2472,7 +2469,6 @@ void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
\
/* Stage4 */ \
{ \
- const __m128i zero = _mm_setzero_si128(); \
const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \
const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \
\
@@ -3009,6 +3005,7 @@ void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
// Only upper-left 8x8 has non-zero coeff
void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest,
int stride) {
+ const __m128i zero = _mm_setzero_si128();
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i final_rounding = _mm_set1_epi16(1 << 5);
@@ -3104,7 +3101,6 @@ void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest,
col[31] = _mm_sub_epi16(stp1_0, stp1_31);
for (i = 0; i < 4; i++) {
int j;
- const __m128i zero = _mm_setzero_si128();
// Transpose 32x8 block to 8x32 block
array_transpose_8x8(col + i * 8, in);
IDCT32_34
@@ -3677,7 +3673,7 @@ void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8,
}
}
-void vpx_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest8,
int stride, int bd) {
tran_low_t out[8 * 8] = { 0 };
tran_low_t *outptr = out;
@@ -4021,8 +4017,8 @@ void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest8,
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
tran_low_t out;
- out = highbd_dct_const_round_shift(input[0] * cospi_16_64);
- out = highbd_dct_const_round_shift(out * cospi_16_64);
+ out = dct_const_round_shift(input[0] * cospi_16_64);
+ out = dct_const_round_shift(out * cospi_16_64);
a = ROUND_POWER_OF_TWO(out, 6);
d = _mm_set1_epi32(a);
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/quantize_sse2.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/quantize_sse2.c
index 2c7e431c745..0580a7bd7b6 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/quantize_sse2.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/quantize_sse2.c
@@ -13,32 +13,7 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
-
-static INLINE __m128i load_coefficients(const tran_low_t *coeff_ptr) {
-#if CONFIG_VP9_HIGHBITDEPTH
- return _mm_setr_epi16((int16_t)coeff_ptr[0], (int16_t)coeff_ptr[1],
- (int16_t)coeff_ptr[2], (int16_t)coeff_ptr[3],
- (int16_t)coeff_ptr[4], (int16_t)coeff_ptr[5],
- (int16_t)coeff_ptr[6], (int16_t)coeff_ptr[7]);
-#else
- return _mm_load_si128((const __m128i *)coeff_ptr);
-#endif
-}
-
-static INLINE void store_coefficients(__m128i coeff_vals,
- tran_low_t *coeff_ptr) {
-#if CONFIG_VP9_HIGHBITDEPTH
- __m128i one = _mm_set1_epi16(1);
- __m128i coeff_vals_hi = _mm_mulhi_epi16(coeff_vals, one);
- __m128i coeff_vals_lo = _mm_mullo_epi16(coeff_vals, one);
- __m128i coeff_vals_1 = _mm_unpacklo_epi16(coeff_vals_lo, coeff_vals_hi);
- __m128i coeff_vals_2 = _mm_unpackhi_epi16(coeff_vals_lo, coeff_vals_hi);
- _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals_1);
- _mm_store_si128((__m128i *)(coeff_ptr + 4), coeff_vals_2);
-#else
- _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals);
-#endif
-}
+#include "vpx_dsp/x86/fdct.h"
void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
int skip_block, const int16_t *zbin_ptr,
@@ -81,8 +56,8 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
__m128i qtmp0, qtmp1;
__m128i cmp_mask0, cmp_mask1;
// Do DC and first 15 AC
- coeff0 = load_coefficients(coeff_ptr + n_coeffs);
- coeff1 = load_coefficients(coeff_ptr + n_coeffs + 8);
+ coeff0 = load_tran_low(coeff_ptr + n_coeffs);
+ coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8);
// Poor man's sign extract
coeff0_sign = _mm_srai_epi16(coeff0, 15);
@@ -117,15 +92,15 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
- store_coefficients(qcoeff0, qcoeff_ptr + n_coeffs);
- store_coefficients(qcoeff1, qcoeff_ptr + n_coeffs + 8);
+ store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);
+ store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);
coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
dequant = _mm_unpackhi_epi64(dequant, dequant);
coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
- store_coefficients(coeff0, dqcoeff_ptr + n_coeffs);
- store_coefficients(coeff1, dqcoeff_ptr + n_coeffs + 8);
+ store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);
+ store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);
}
{
@@ -159,8 +134,8 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
__m128i qtmp0, qtmp1;
__m128i cmp_mask0, cmp_mask1;
- coeff0 = load_coefficients(coeff_ptr + n_coeffs);
- coeff1 = load_coefficients(coeff_ptr + n_coeffs + 8);
+ coeff0 = load_tran_low(coeff_ptr + n_coeffs);
+ coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8);
// Poor man's sign extract
coeff0_sign = _mm_srai_epi16(coeff0, 15);
@@ -191,14 +166,14 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
- store_coefficients(qcoeff0, qcoeff_ptr + n_coeffs);
- store_coefficients(qcoeff1, qcoeff_ptr + n_coeffs + 8);
+ store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);
+ store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);
coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
- store_coefficients(coeff0, dqcoeff_ptr + n_coeffs);
- store_coefficients(coeff1, dqcoeff_ptr + n_coeffs + 8);
+ store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);
+ store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);
}
{
@@ -237,10 +212,10 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
}
} else {
do {
- store_coefficients(zero, dqcoeff_ptr + n_coeffs);
- store_coefficients(zero, dqcoeff_ptr + n_coeffs + 8);
- store_coefficients(zero, qcoeff_ptr + n_coeffs);
- store_coefficients(zero, qcoeff_ptr + n_coeffs + 8);
+ store_tran_low(zero, dqcoeff_ptr + n_coeffs);
+ store_tran_low(zero, dqcoeff_ptr + n_coeffs + 8);
+ store_tran_low(zero, qcoeff_ptr + n_coeffs);
+ store_tran_low(zero, qcoeff_ptr + n_coeffs + 8);
n_coeffs += 8 * 2;
} while (n_coeffs < 0);
*eob_ptr = 0;
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
index b26d97b4551..09c75d455ca 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
@@ -860,16 +860,6 @@ static void scaledconvolve2d(const uint8_t *src, ptrdiff_t src_stride,
}
}
-static const InterpKernel *get_filter_base(const int16_t *filter) {
- // NOTE: This assumes that the filter table is 256-byte aligned.
- // TODO(agrange) Modify to make independent of table alignment.
- return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF));
-}
-
-static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
- return (int)((const InterpKernel *)(intptr_t)f - base);
-}
-
void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
ptrdiff_t dst_stride, const int16_t *filter_x,
int x_step_q4, const int16_t *filter_y, int y_step_q4,
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_mem/vpx_mem.c b/chromium/third_party/libvpx/source/libvpx/vpx_mem/vpx_mem.c
index c94ed52d16d..a9be0868066 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_mem/vpx_mem.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_mem/vpx_mem.c
@@ -76,38 +76,6 @@ void *vpx_calloc(size_t num, size_t size) {
return x;
}
-void *vpx_realloc(void *memblk, size_t size) {
- void *new_addr = NULL;
-
- /*
- The realloc() function changes the size of the object pointed to by
- ptr to the size specified by size, and returns a pointer to the
- possibly moved block. The contents are unchanged up to the lesser
- of the new and old sizes. If ptr is null, realloc() behaves like
- malloc() for the specified size. If size is zero (0) and ptr is
- not a null pointer, the object pointed to is freed.
- */
- if (!memblk)
- new_addr = vpx_malloc(size);
- else if (!size)
- vpx_free(memblk);
- else {
- void *addr = get_actual_malloc_address(memblk);
- const uint64_t aligned_size =
- get_aligned_malloc_size(size, DEFAULT_ALIGNMENT);
- if (!check_size_argument_overflow(1, aligned_size)) return NULL;
-
- addr = realloc(addr, (size_t)aligned_size);
- if (addr) {
- new_addr = align_addr((unsigned char *)addr + ADDRESS_STORAGE_SIZE,
- DEFAULT_ALIGNMENT);
- set_actual_malloc_address(new_addr, addr);
- }
- }
-
- return new_addr;
-}
-
void vpx_free(void *memblk) {
if (memblk) {
void *addr = get_actual_malloc_address(memblk);
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_mem/vpx_mem.h b/chromium/third_party/libvpx/source/libvpx/vpx_mem/vpx_mem.h
index c14f288b895..733aff4885c 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_mem/vpx_mem.h
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_mem/vpx_mem.h
@@ -26,7 +26,6 @@ extern "C" {
void *vpx_memalign(size_t align, size_t size);
void *vpx_malloc(size_t size);
void *vpx_calloc(size_t num, size_t size);
-void *vpx_realloc(void *memblk, size_t size);
void vpx_free(void *memblk);
#if CONFIG_VP9_HIGHBITDEPTH
diff --git a/chromium/third_party/libvpx/source/libvpx/vpxdec.c b/chromium/third_party/libvpx/source/libvpx/vpxdec.c
index d1ed3e6cae0..2cdb69d5a31 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpxdec.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpxdec.c
@@ -9,11 +9,11 @@
*/
#include <assert.h>
+#include <limits.h>
+#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
-#include <stdarg.h>
#include <string.h>
-#include <limits.h>
#include "./vpx_config.h"
@@ -92,31 +92,19 @@ static const arg_def_t md5arg =
static const arg_def_t outbitdeptharg =
ARG_DEF(NULL, "output-bit-depth", 1, "Output bit-depth for decoded frames");
#endif
-
-static const arg_def_t *all_args[] = { &codecarg,
- &use_yv12,
- &use_i420,
- &flipuvarg,
- &rawvideo,
- &noblitarg,
- &progressarg,
- &limitarg,
- &skiparg,
- &postprocarg,
- &summaryarg,
- &outputfile,
- &threadsarg,
- &frameparallelarg,
- &verbosearg,
- &scalearg,
- &fb_arg,
- &md5arg,
- &error_concealment,
- &continuearg,
+static const arg_def_t svcdecodingarg = ARG_DEF(
+ NULL, "svc-decode-layer", 1, "Decode SVC stream up to given spatial layer");
+
+static const arg_def_t *all_args[] = {
+ &codecarg, &use_yv12, &use_i420, &flipuvarg, &rawvideo,
+ &noblitarg, &progressarg, &limitarg, &skiparg, &postprocarg,
+ &summaryarg, &outputfile, &threadsarg, &frameparallelarg, &verbosearg,
+ &scalearg, &fb_arg, &md5arg, &error_concealment, &continuearg,
#if CONFIG_VP9_HIGHBITDEPTH
- &outbitdeptharg,
+ &outbitdeptharg,
#endif
- NULL };
+ &svcdecodingarg, NULL
+};
#if CONFIG_VP8_DECODER
static const arg_def_t addnoise_level =
@@ -519,6 +507,8 @@ static int main_loop(int argc, const char **argv_) {
#if CONFIG_VP9_HIGHBITDEPTH
unsigned int output_bit_depth = 0;
#endif
+ int svc_decoding = 0;
+ int svc_spatial_layer = 0;
#if CONFIG_VP8_DECODER
vp8_postproc_cfg_t vp8_pp_cfg = { 0, 0, 0 };
#endif
@@ -610,6 +600,10 @@ static int main_loop(int argc, const char **argv_) {
output_bit_depth = arg_parse_uint(&arg);
}
#endif
+ else if (arg_match(&arg, &svcdecodingarg, argi)) {
+ svc_decoding = 1;
+ svc_spatial_layer = arg_parse_uint(&arg);
+ }
#if CONFIG_VP8_DECODER
else if (arg_match(&arg, &addnoise_level, argi)) {
postproc = 1;
@@ -726,7 +720,14 @@ static int main_loop(int argc, const char **argv_) {
vpx_codec_error(&decoder));
goto fail2;
}
-
+ if (svc_decoding) {
+ if (vpx_codec_control(&decoder, VP9_DECODE_SVC_SPATIAL_LAYER,
+ svc_spatial_layer)) {
+ fprintf(stderr, "Failed to set spatial layer for svc decode: %s\n",
+ vpx_codec_error(&decoder));
+ goto fail;
+ }
+ }
if (!quiet) fprintf(stderr, "%s\n", decoder.name);
#if CONFIG_VP8_DECODER
@@ -780,8 +781,8 @@ static int main_loop(int argc, const char **argv_) {
const char *detail = vpx_codec_error_detail(&decoder);
warn("Failed to decode frame %d: %s", frame_in,
vpx_codec_error(&decoder));
-
if (detail) warn("Additional information: %s", detail);
+ corrupted = 1;
if (!keep_going) goto fail;
}
@@ -800,6 +801,8 @@ static int main_loop(int argc, const char **argv_) {
// Flush the decoder in frame parallel decode.
if (vpx_codec_decode(&decoder, NULL, 0, NULL, 0)) {
warn("Failed to flush decoder: %s", vpx_codec_error(&decoder));
+ corrupted = 1;
+ if (!keep_going) goto fail;
}
}
@@ -812,7 +815,7 @@ static int main_loop(int argc, const char **argv_) {
vpx_usec_timer_mark(&timer);
dx_time += (unsigned int)vpx_usec_timer_elapsed(&timer);
- if (!frame_parallel &&
+ if (!frame_parallel && !corrupted &&
vpx_codec_control(&decoder, VP8D_GET_FRAME_CORRUPTED, &corrupted)) {
warn("Failed VP8_GET_FRAME_CORRUPTED: %s", vpx_codec_error(&decoder));
if (!keep_going) goto fail;
diff --git a/chromium/third_party/libvpx/source/libvpx/vpxenc.c b/chromium/third_party/libvpx/source/libvpx/vpxenc.c
index 6e0af57a42c..a0f760574c8 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpxenc.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpxenc.c
@@ -355,6 +355,8 @@ static const arg_def_t cq_level =
ARG_DEF(NULL, "cq-level", 1, "Constant/Constrained Quality level");
static const arg_def_t max_intra_rate_pct =
ARG_DEF(NULL, "max-intra-rate", 1, "Max I-frame bitrate (pct)");
+static const arg_def_t gf_cbr_boost_pct = ARG_DEF(
+ NULL, "gf-cbr-boost", 1, "Boost for Golden Frame in CBR mode (pct)");
#if CONFIG_VP8_ENCODER
static const arg_def_t cpu_used_vp8 =
@@ -363,12 +365,21 @@ static const arg_def_t token_parts =
ARG_DEF(NULL, "token-parts", 1, "Number of token partitions to use, log2");
static const arg_def_t screen_content_mode =
ARG_DEF(NULL, "screen-content-mode", 1, "Screen content mode");
-static const arg_def_t *vp8_args[] = {
- &cpu_used_vp8, &auto_altref, &noise_sens, &sharpness,
- &static_thresh, &token_parts, &arnr_maxframes, &arnr_strength,
- &arnr_type, &tune_ssim, &cq_level, &max_intra_rate_pct,
- &screen_content_mode, NULL
-};
+static const arg_def_t *vp8_args[] = { &cpu_used_vp8,
+ &auto_altref,
+ &noise_sens,
+ &sharpness,
+ &static_thresh,
+ &token_parts,
+ &arnr_maxframes,
+ &arnr_strength,
+ &arnr_type,
+ &tune_ssim,
+ &cq_level,
+ &max_intra_rate_pct,
+ &gf_cbr_boost_pct,
+ &screen_content_mode,
+ NULL };
static const int vp8_arg_ctrl_map[] = { VP8E_SET_CPUUSED,
VP8E_SET_ENABLEAUTOALTREF,
VP8E_SET_NOISE_SENSITIVITY,
@@ -381,6 +392,7 @@ static const int vp8_arg_ctrl_map[] = { VP8E_SET_CPUUSED,
VP8E_SET_TUNING,
VP8E_SET_CQ_LEVEL,
VP8E_SET_MAX_INTRA_BITRATE_PCT,
+ VP8E_SET_GF_CBR_BOOST_PCT,
VP8E_SET_SCREEN_CONTENT_MODE,
0 };
#endif
@@ -407,8 +419,6 @@ static const arg_def_t alt_ref_aq = ARG_DEF(NULL, "alt-ref-aq", 1,
static const arg_def_t frame_periodic_boost =
ARG_DEF(NULL, "frame-boost", 1,
"Enable frame periodic boost (0: off (default), 1: on)");
-static const arg_def_t gf_cbr_boost_pct = ARG_DEF(
- NULL, "gf-cbr-boost", 1, "Boost for Golden Frame in CBR mode (pct)");
static const arg_def_t max_inter_rate_pct =
ARG_DEF(NULL, "max-inter-rate", 1, "Max P-frame bitrate (pct)");
static const arg_def_t min_gf_interval = ARG_DEF(